@inproceedings{hu-etal-2024-embedding,
title = "Embedding and Gradient Say Wrong: A White-Box Method for Hallucination Detection",
author = "Hu, Xiaomeng and
Zhang, Yiming and
Peng, Ru and
Zhang, Haozhe and
Wu, Chenwei and
Chen, Gang and
Zhao, Junbo",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.116",
pages = "1950--1959",
abstract = "In recent years, large language models (LLMs) have achieved remarkable success in the field of natural language generation. Compared to previous small-scale models, they are capable of generating fluent output based on the provided prefix or prompt. However, one critical challenge {---} the *hallucination* problem {---} remains to be resolved. Generally, the community refers to the undetected hallucination scenario where the LLMs generate text unrelated to the input text or facts. In this study, we intend to model the distributional distance between the regular conditional output and the unconditional output, which is generated without a given input text. Based upon Taylor Expansion for this distance at the output probability space, our approach manages to leverage the embedding and first-order gradient information. The resulting approach is plug-and-play that can be easily adapted to any autoregressive LLM. On the hallucination benchmarks HADES and other datasets, our approach achieves state-of-the-art performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hu-etal-2024-embedding">
<titleInfo>
<title>Embedding and Gradient Say Wrong: A White-Box Method for Hallucination Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiaomeng</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiming</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ru</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haozhe</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenwei</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junbo</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In recent years, large language models (LLMs) have achieved remarkable success in the field of natural language generation. Compared to previous small-scale models, they are capable of generating fluent output based on the provided prefix or prompt. However, one critical challenge — the *hallucination* problem — remains to be resolved. Generally, the community refers to the undetected hallucination scenario where the LLMs generate text unrelated to the input text or facts. In this study, we intend to model the distributional distance between the regular conditional output and the unconditional output, which is generated without a given input text. Based upon Taylor Expansion for this distance at the output probability space, our approach manages to leverage the embedding and first-order gradient information. The resulting approach is plug-and-play that can be easily adapted to any autoregressive LLM. On the hallucination benchmarks HADES and other datasets, our approach achieves state-of-the-art performance.</abstract>
<identifier type="citekey">hu-etal-2024-embedding</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.116</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1950</start>
<end>1959</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Embedding and Gradient Say Wrong: A White-Box Method for Hallucination Detection
%A Hu, Xiaomeng
%A Zhang, Yiming
%A Peng, Ru
%A Zhang, Haozhe
%A Wu, Chenwei
%A Chen, Gang
%A Zhao, Junbo
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F hu-etal-2024-embedding
%X In recent years, large language models (LLMs) have achieved remarkable success in the field of natural language generation. Compared to previous small-scale models, they are capable of generating fluent output based on the provided prefix or prompt. However, one critical challenge — the *hallucination* problem — remains to be resolved. Generally, the community refers to the undetected hallucination scenario where the LLMs generate text unrelated to the input text or facts. In this study, we intend to model the distributional distance between the regular conditional output and the unconditional output, which is generated without a given input text. Based upon Taylor Expansion for this distance at the output probability space, our approach manages to leverage the embedding and first-order gradient information. The resulting approach is plug-and-play that can be easily adapted to any autoregressive LLM. On the hallucination benchmarks HADES and other datasets, our approach achieves state-of-the-art performance.
%U https://aclanthology.org/2024.emnlp-main.116
%P 1950-1959
Markdown (Informal)
[Embedding and Gradient Say Wrong: A White-Box Method for Hallucination Detection](https://aclanthology.org/2024.emnlp-main.116) (Hu et al., EMNLP 2024)
ACL
Xiaomeng Hu, Yiming Zhang, Ru Peng, Haozhe Zhang, Chenwei Wu, Gang Chen, and Junbo Zhao. 2024. Embedding and Gradient Say Wrong: A White-Box Method for Hallucination Detection. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 1950–1959, Miami, Florida, USA. Association for Computational Linguistics.
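
The abstract above describes a white-box detector built on the gap between a model's conditional output distribution (given the input text) and its unconditional one (input removed). As a rough illustration only, and not the authors' released implementation, the following Python sketch scores a generation by the mean per-token log-probability gap between the two settings; it omits the paper's Taylor-expansion step and its embedding and first-order gradient features, and the model name, example texts, and scoring rule are assumptions made for the sketch.

# Hypothetical sketch of the conditional-vs-unconditional comparison sketched in the
# abstract. It is NOT the paper's method: no Taylor expansion, no embedding or
# gradient features; model, texts, and the mean log-ratio score are assumptions.
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # assumption: any autoregressive LM exposing logits works
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

def continuation_log_probs(prefix_ids, cont_ids):
    """Per-token log-probabilities of cont_ids when generated after prefix_ids."""
    input_ids = torch.cat([prefix_ids, cont_ids], dim=1)
    with torch.no_grad():
        logits = model(input_ids).logits
    log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)  # position t predicts t+1
    targets = input_ids[:, 1:]
    token_lp = log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    return token_lp[0, prefix_ids.shape[1] - 1:]          # keep continuation tokens only

source = "The Eiffel Tower is located in Paris, France. "
generation = "The Eiffel Tower is in Berlin."

cont_ids = tok(generation, return_tensors="pt").input_ids
cond_prefix = tok(source, return_tensors="pt").input_ids   # conditional: input present
uncond_prefix = torch.tensor([[tok.bos_token_id]])         # unconditional: input removed

cond = continuation_log_probs(cond_prefix, cont_ids)
uncond = continuation_log_probs(uncond_prefix, cont_ids)

# When the input strongly supports the generation, cond >> uncond and the score is
# very negative; a gap near zero means the input adds little support, which this
# sketch treats as a crude hallucination signal.
score = (uncond - cond).mean().item()
print(f"hallucination score: {score:.3f}")

Reusing the same continuation token ids in both passes keeps the per-token comparison aligned; only the prefix changes between the conditional and unconditional runs.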