@inproceedings{sarkar-etal-2025-mitigating,
title = "Mitigating Hallucinations in Vision-Language Models through Image-Guided Head Suppression",
author = "Sarkar, Sreetama and
Che, Yue and
Gavin, Alex and
Beerel, Peter Anthony and
Kundu, Souvik",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.631/",
pages = "12492--12511",
ISBN = "979-8-89176-332-6",
abstract = "Despite their remarkable progress in multimodal understanding tasks, large vision language models (LVLMs) often suffer from ``hallucination'', generating texts misaligned with the visual context. Existing methods aimed at reducing hallucinations through inference time intervention incur a significant increase in latency. To mitigate this, we present **SPIN**, a task-agnostic attention-guided head suppression strategy that can be seamlessly integrated during inference **without incurring any significant compute or latency overhead**. We investigate whether hallucination in LVLMs can be linked to specific model components. Our analysis suggests that hallucinations can be attributed to a dynamic subset of attention heads in each layer. Leveraging this insight, for each text query token, we selectively suppress attention heads that exhibit low attention to image tokens, keeping the top-k attention heads intact. Extensive evaluations on visual question answering and image description tasks demonstrate the efficacy of SPIN in reducing hallucination scores up to **2.7x** while maintaining F1, and improving throughput by **1.8x** compared to existing alternatives."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sarkar-etal-2025-mitigating">
<titleInfo>
<title>Mitigating Hallucinations in Vision-Language Models through Image-Guided Head Suppression</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sreetama</namePart>
<namePart type="family">Sarkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Gavin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="given">Anthony</namePart>
<namePart type="family">Beerel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Souvik</namePart>
<namePart type="family">Kundu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Despite their remarkable progress in multimodal understanding tasks, large vision language models (LVLMs) often suffer from “hallucination”, generating texts misaligned with the visual context. Existing methods aimed at reducing hallucinations through inference time intervention incur a significant increase in latency. To mitigate this, we present **SPIN**, a task-agnostic attention-guided head suppression strategy that can be seamlessly integrated during inference **without incurring any significant compute or latency overhead**. We investigate whether hallucination in LVLMs can be linked to specific model components. Our analysis suggests that hallucinations can be attributed to a dynamic subset of attention heads in each layer. Leveraging this insight, for each text query token, we selectively suppress attention heads that exhibit low attention to image tokens, keeping the top-k attention heads intact. Extensive evaluations on visual question answering and image description tasks demonstrate the efficacy of SPIN in reducing hallucination scores up to **2.7x** while maintaining F1, and improving throughput by **1.8x** compared to existing alternatives.</abstract>
<identifier type="citekey">sarkar-etal-2025-mitigating</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.631/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>12492</start>
<end>12511</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mitigating Hallucinations in Vision-Language Models through Image-Guided Head Suppression
%A Sarkar, Sreetama
%A Che, Yue
%A Gavin, Alex
%A Beerel, Peter Anthony
%A Kundu, Souvik
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F sarkar-etal-2025-mitigating
%X Despite their remarkable progress in multimodal understanding tasks, large vision language models (LVLMs) often suffer from “hallucination”, generating texts misaligned with the visual context. Existing methods aimed at reducing hallucinations through inference time intervention incur a significant increase in latency. To mitigate this, we present SPIN, a task-agnostic attention-guided head suppression strategy that can be seamlessly integrated during inference without incurring any significant compute or latency overhead. We investigate whether hallucination in LVLMs can be linked to specific model components. Our analysis suggests that hallucinations can be attributed to a dynamic subset of attention heads in each layer. Leveraging this insight, for each text query token, we selectively suppress attention heads that exhibit low attention to image tokens, keeping the top-k attention heads intact. Extensive evaluations on visual question answering and image description tasks demonstrate the efficacy of SPIN in reducing hallucination scores up to 2.7x while maintaining F1, and improving throughput by 1.8x compared to existing alternatives.
%U https://aclanthology.org/2025.emnlp-main.631/
%P 12492-12511
Markdown (Informal)

[Mitigating Hallucinations in Vision-Language Models through Image-Guided Head Suppression](https://aclanthology.org/2025.emnlp-main.631/) (Sarkar et al., EMNLP 2025)

ACL

Sreetama Sarkar, Yue Che, Alex Gavin, Peter Anthony Beerel, and Souvik Kundu. 2025. Mitigating Hallucinations in Vision-Language Models through Image-Guided Head Suppression. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 12492–12511, Suzhou, China. Association for Computational Linguistics.
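The abstract sketches the core mechanism: for each text query token, rank attention heads by how much attention mass they place on image tokens, keep the top-k, and suppress the rest. Below is a minimal PyTorch sketch of that idea. It is not the authors' implementation; the tensor layout, the function name `suppress_low_image_attention_heads`, the image-token slice, and the zero-out suppression rule are all assumptions made for illustration.

```python
# Minimal sketch of image-guided head suppression (SPIN-style idea),
# under assumed shapes -- not the paper's implementation.
import torch

def suppress_low_image_attention_heads(attn, image_slice, top_k):
    """For each query token, keep the top_k heads with the highest
    attention mass on image tokens and zero out the remaining heads.

    attn: [H, Q, S] post-softmax attention weights (assumed layout:
          H heads, Q text query tokens, S total sequence length).
    image_slice: positions of the image tokens within the sequence.
    """
    # Per-head, per-query attention mass on the image tokens: [H, Q]
    image_mass = attn[:, :, image_slice].sum(dim=-1)
    # Indices of the top-k heads for each query token: [top_k, Q]
    topk_idx = image_mass.topk(top_k, dim=0).indices
    # Boolean keep-mask over heads, built per query token
    keep = torch.zeros_like(image_mass, dtype=torch.bool)
    keep.scatter_(0, topk_idx, True)
    # Suppress (zero) heads outside the top-k; broadcast over keys
    return attn * keep.unsqueeze(-1)

# Toy usage: 8 heads, 4 query tokens, 20-token sequence,
# with image tokens assumed to occupy positions 0..9.
attn = torch.softmax(torch.randn(8, 4, 20), dim=-1)
pruned = suppress_low_image_attention_heads(attn, slice(0, 10), top_k=4)
```

In a real LVLM this masking would be applied inside each attention layer before the value aggregation, and the exact suppression rule (hard zeroing vs. rescaling, and how k is chosen per layer) follows the paper rather than this sketch.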