@inproceedings{zhang-etal-2025-shallow,
title = "Shallow Focus, Deep Fixes: Enhancing Shallow Layers Vision Attention Sinks to Alleviate Hallucination in {LVLM}s",
author = "Zhang, Xiaofeng and
Quan, Yihao and
Shen, Chen and
Gu, Chaochen and
Yuan, Xiaosong and
Yan, Shaotian and
Cao, Jiawei and
Cheng, Hao and
Wu, Kaijie and
Ye, Jieping",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.174/",
doi = "10.18653/v1/2025.emnlp-main.174",
pages = "3512--3534",
ISBN = "979-8-89176-332-6",
abstract = "Multimodal large language models (MLLMs) demonstrate excellent abilities for understanding visual information, while the hallucination remains. Albeit image tokens constitute the majority of the MLLMs input, the relation between image tokens and hallucinations is still unexplored. In this paper, we analyze the attention score distribution of image tokens across layers and attention heads in models, revealing an intriguing but common phenomenon: most hallucinations are closely linked to the attention sink patterns of image tokens attention matrix, where shallow layers exhibit dense sinks and deep layers exhibit the sparse. We further explore the attention heads of different layers, finding: heads with high-density attention sink of the image part act positively in mitigating hallucinations. Inspired by these findings, we propose a training-free approach called Enhancing Vision Attention Sinks (EVAS) to facilitate the convergence of the image token attention sink within shallow layers. Specifically, EVAS identifies the attention heads that emerge as the densest visual sink in shallow layers and extracts its attention matrix, which is then broadcast to other heads of the same layer, thereby strengthing the layer{'}s focus on the image itself. Extensive empirical results of various MLLMs illustrate the superior performance of the proposed EVAS, demonstrating its effectiveness and generality."
}
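The abstract describes EVAS operationally: within a shallow layer, find the attention head whose image-token columns form the densest sink, then broadcast that head's attention matrix to the layer's other heads. As a rough illustration only, here is a minimal PyTorch-style sketch of that step; the function name, the sink_thresh cutoff, and the mean-over-queries sink criterion are assumptions of this sketch, not details taken from the paper.

import torch

def evas_broadcast(attn, image_slice, sink_thresh=0.1):
    # Hypothetical sketch of the broadcast step described in the abstract.
    # attn: (num_heads, seq_len, seq_len) post-softmax attention of one
    # shallow layer; image_slice: slice covering the image-token positions.
    img_attn = attn[:, :, image_slice]               # (H, L, n_img)
    # Call an image-token column a "sink" if it absorbs unusually high
    # attention averaged over all queries (assumed criterion).
    sink_mask = img_attn.mean(dim=1) > sink_thresh   # (H, n_img)
    density = sink_mask.float().mean(dim=1)          # sink density per head
    donor = density.argmax()                         # densest visual-sink head
    # Broadcast the donor head's attention matrix to every head of the layer.
    return attn[donor].unsqueeze(0).expand_as(attn).clone()

In practice such a hook would be applied only to the shallow layers the paper targets, leaving the attention of deeper layers untouched.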
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-shallow">
<titleInfo>
<title>Shallow Focus, Deep Fixes: Enhancing Shallow Layers Vision Attention Sinks to Alleviate Hallucination in LVLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiaofeng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yihao</namePart>
<namePart type="family">Quan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chaochen</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaosong</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shaotian</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiawei</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaijie</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jieping</namePart>
<namePart type="family">Ye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Multimodal large language models (MLLMs) demonstrate excellent abilities in understanding visual information, yet hallucination remains. Although image tokens constitute the majority of the MLLM input, the relation between image tokens and hallucinations is still underexplored. In this paper, we analyze the attention score distribution of image tokens across layers and attention heads, revealing an intriguing but common phenomenon: most hallucinations are closely linked to the attention sink patterns of the image tokens’ attention matrices, where shallow layers exhibit dense sinks and deep layers exhibit sparse ones. We further explore the attention heads of different layers, finding that heads with high-density attention sinks over the image tokens act positively in mitigating hallucinations. Inspired by these findings, we propose a training-free approach called Enhancing Vision Attention Sinks (EVAS) to facilitate the convergence of image-token attention sinks within shallow layers. Specifically, EVAS identifies the attention head that emerges as the densest visual sink in the shallow layers and extracts its attention matrix, which is then broadcast to the other heads of the same layer, thereby strengthening the layer’s focus on the image itself. Extensive empirical results on various MLLMs illustrate the superior performance of the proposed EVAS, demonstrating its effectiveness and generality.</abstract>
<identifier type="citekey">zhang-etal-2025-shallow</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.174</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.174/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>3512</start>
<end>3534</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Shallow Focus, Deep Fixes: Enhancing Shallow Layers Vision Attention Sinks to Alleviate Hallucination in LVLMs
%A Zhang, Xiaofeng
%A Quan, Yihao
%A Shen, Chen
%A Gu, Chaochen
%A Yuan, Xiaosong
%A Yan, Shaotian
%A Cao, Jiawei
%A Cheng, Hao
%A Wu, Kaijie
%A Ye, Jieping
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F zhang-etal-2025-shallow
%X Multimodal large language models (MLLMs) demonstrate excellent abilities in understanding visual information, yet hallucination remains. Although image tokens constitute the majority of the MLLM input, the relation between image tokens and hallucinations is still underexplored. In this paper, we analyze the attention score distribution of image tokens across layers and attention heads, revealing an intriguing but common phenomenon: most hallucinations are closely linked to the attention sink patterns of the image tokens’ attention matrices, where shallow layers exhibit dense sinks and deep layers exhibit sparse ones. We further explore the attention heads of different layers, finding that heads with high-density attention sinks over the image tokens act positively in mitigating hallucinations. Inspired by these findings, we propose a training-free approach called Enhancing Vision Attention Sinks (EVAS) to facilitate the convergence of image-token attention sinks within shallow layers. Specifically, EVAS identifies the attention head that emerges as the densest visual sink in the shallow layers and extracts its attention matrix, which is then broadcast to the other heads of the same layer, thereby strengthening the layer’s focus on the image itself. Extensive empirical results on various MLLMs illustrate the superior performance of the proposed EVAS, demonstrating its effectiveness and generality.
%R 10.18653/v1/2025.emnlp-main.174
%U https://aclanthology.org/2025.emnlp-main.174/
%U https://doi.org/10.18653/v1/2025.emnlp-main.174
%P 3512-3534
Markdown (Informal)
[Shallow Focus, Deep Fixes: Enhancing Shallow Layers Vision Attention Sinks to Alleviate Hallucination in LVLMs](https://aclanthology.org/2025.emnlp-main.174/) (Zhang et al., EMNLP 2025)
ACL
- Xiaofeng Zhang, Yihao Quan, Chen Shen, Chaochen Gu, Xiaosong Yuan, Shaotian Yan, Jiawei Cao, Hao Cheng, Kaijie Wu, and Jieping Ye. 2025. Shallow Focus, Deep Fixes: Enhancing Shallow Layers Vision Attention Sinks to Alleviate Hallucination in LVLMs. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 3512–3534, Suzhou, China. Association for Computational Linguistics.