@inproceedings{prabhakaran-etal-2025-vade,
title = "{VADE}: Visual Attention Guided Hallucination Detection and Elimination",
author = "Prabhakaran, Vishnu and
Aggarwal, Purav and
Verma, Vinay Kumar and
Swamy, Gokul and
Saladi, Anoop",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.773/",
doi = "10.18653/v1/2025.findings-acl.773",
pages = "14949--14965",
ISBN = "979-8-89176-256-5",
abstract = "Vision Language Models (VLMs) have achieved significant advancements in complex visual understanding tasks. However, VLMs are prone to hallucinations{---}generating outputs that lack alignment with visual content. This paper addresses hallucination detection in VLMs by leveraging the visual grounding information encoded in transformer attention maps. We identify three primary challenges in this approach: the elective nature of visual grounding for certain tokens, the high-dimensional and noisy nature of attention maps, and the dynamic sequence length of attention on previous tokens. To address these, we propose VADE, a novel sequence modelling approach to effectively learn complex sequential patterns from high-dimensional and noisy attention maps for fine-grained hallucination detection and mitigation. VADE achieves an average PR-AUC of 80{\%} in hallucination detection on M-HalDetect across four different model architectures and an 5{\%} improvement in hallucination mitigation on MSCOCO."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="prabhakaran-etal-2025-vade">
<titleInfo>
<title>VADE: Visual Attention Guided Hallucination Detection and Elimination</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vishnu</namePart>
<namePart type="family">Prabhakaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Purav</namePart>
<namePart type="family">Aggarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vinay</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Verma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gokul</namePart>
<namePart type="family">Swamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Saladi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Vision Language Models (VLMs) have achieved significant advancements in complex visual understanding tasks. However, VLMs are prone to hallucinations—generating outputs that lack alignment with visual content. This paper addresses hallucination detection in VLMs by leveraging the visual grounding information encoded in transformer attention maps. We identify three primary challenges in this approach: the elective nature of visual grounding for certain tokens, the high-dimensional and noisy nature of attention maps, and the dynamic sequence length of attention on previous tokens. To address these, we propose VADE, a novel sequence modelling approach to effectively learn complex sequential patterns from high-dimensional and noisy attention maps for fine-grained hallucination detection and mitigation. VADE achieves an average PR-AUC of 80% in hallucination detection on M-HalDetect across four different model architectures and an 5% improvement in hallucination mitigation on MSCOCO.</abstract>
<identifier type="citekey">prabhakaran-etal-2025-vade</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.773</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.773/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>14949</start>
<end>14965</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VADE: Visual Attention Guided Hallucination Detection and Elimination
%A Prabhakaran, Vishnu
%A Aggarwal, Purav
%A Verma, Vinay Kumar
%A Swamy, Gokul
%A Saladi, Anoop
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F prabhakaran-etal-2025-vade
%X Vision Language Models (VLMs) have achieved significant advancements in complex visual understanding tasks. However, VLMs are prone to hallucinations—generating outputs that lack alignment with visual content. This paper addresses hallucination detection in VLMs by leveraging the visual grounding information encoded in transformer attention maps. We identify three primary challenges in this approach: the elective nature of visual grounding for certain tokens, the high-dimensional and noisy nature of attention maps, and the dynamic sequence length of attention on previous tokens. To address these, we propose VADE, a novel sequence modelling approach to effectively learn complex sequential patterns from high-dimensional and noisy attention maps for fine-grained hallucination detection and mitigation. VADE achieves an average PR-AUC of 80% in hallucination detection on M-HalDetect across four different model architectures and an 5% improvement in hallucination mitigation on MSCOCO.
%R 10.18653/v1/2025.findings-acl.773
%U https://aclanthology.org/2025.findings-acl.773/
%U https://doi.org/10.18653/v1/2025.findings-acl.773
%P 14949-14965
Markdown (Informal)
[VADE: Visual Attention Guided Hallucination Detection and Elimination](https://aclanthology.org/2025.findings-acl.773/) (Prabhakaran et al., Findings 2025)
ACL