@inproceedings{zhou-etal-2026-unveiling,
title = "Unveiling Inherent Visual Grounding in Multimodal {LLM}s for Text-Rich Images",
author = "Zhou, Shijie and
Kil, Jihyung and
Li, Ming and
Gu, Jiuxiang and
Wigington, Curtis and
Jain, Rajiv and
Chen, Changyou and
Zhang, Ruiyi",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.16/",
pages = "352--370",
ISBN = "979-8-89176-395-1",
abstract = "Visual text grounding provides interpretable evidence for document question answering. Due to the complex layouts and mixed visual-text contents in text-rich images, effective visual text grounding requires strong visual and spatial reasoning to localize multiple referenced regions. Existing multimodal large language model (MLLM) approaches often struggle to align query tokens with visual{--}text patches, heavily relying on lengthy OCR inputs. To tackle this problem, we propose Doc-AGround, an OCR-free approach that leverages the MLLM{'}s inherent multi-head attention for multi-patch grounding. Doc-AGround extracts a patch-wise attention map as the grounding prediction. Concurrently, it introduces an effective multi-head weighting mechanism to amplify the attention heads' intrinsic role in connecting vision and text. Empirical results of Doc-AGround show state-of-the-art performance on challenging document grounding benchmarks, demonstrating the effectiveness of the proposed attention-based grounding design."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-etal-2026-unveiling">
<titleInfo>
<title>Unveiling Inherent Visual Grounding in Multimodal LLMs for Text-Rich Images</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shijie</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jihyung</namePart>
<namePart type="family">Kil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiuxiang</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Curtis</namePart>
<namePart type="family">Wigington</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajiv</namePart>
<namePart type="family">Jain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changyou</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruiyi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Visual text grounding provides interpretable evidence for document question answering. Due to the complex layouts and mixed visual-text contents in text-rich images, effective visual text grounding requires strong visual and spatial reasoning to localize multiple referenced regions. Existing multimodal large language model (MLLM) approaches often struggle to align query tokens with visual–text patches, heavily relying on lengthy OCR inputs. To tackle this problem, we propose Doc-AGround, an OCR-free approach that leverages the MLLM’s inherent multi-head attention for multi-patch grounding. Doc-AGround extracts a patch-wise attention map as the grounding prediction. Concurrently, it introduces an effective multi-head weighting mechanism to amplify the attention heads’ intrinsic role in connecting vision and text. Empirical results of Doc-AGround show state-of-the-art performance on challenging document grounding benchmarks, demonstrating the effectiveness of the proposed attention-based grounding design.</abstract>
<identifier type="citekey">zhou-etal-2026-unveiling</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.16/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>352</start>
<end>370</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unveiling Inherent Visual Grounding in Multimodal LLMs for Text-Rich Images
%A Zhou, Shijie
%A Kil, Jihyung
%A Li, Ming
%A Gu, Jiuxiang
%A Wigington, Curtis
%A Jain, Rajiv
%A Chen, Changyou
%A Zhang, Ruiyi
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhou-etal-2026-unveiling
%X Visual text grounding provides interpretable evidence for document question answering. Due to the complex layouts and mixed visual-text contents in text-rich images, effective visual text grounding requires strong visual and spatial reasoning to localize multiple referenced regions. Existing multimodal large language model (MLLM) approaches often struggle to align query tokens with visual–text patches, heavily relying on lengthy OCR inputs. To tackle this problem, we propose Doc-AGround, an OCR-free approach that leverages the MLLM’s inherent multi-head attention for multi-patch grounding. Doc-AGround extracts a patch-wise attention map as the grounding prediction. Concurrently, it introduces an effective multi-head weighting mechanism to amplify the attention heads’ intrinsic role in connecting vision and text. Empirical results of Doc-AGround show state-of-the-art performance on challenging document grounding benchmarks, demonstrating the effectiveness of the proposed attention-based grounding design.
%U https://aclanthology.org/2026.findings-acl.16/
%P 352-370
Markdown (Informal)
[Unveiling Inherent Visual Grounding in Multimodal LLMs for Text-Rich Images](https://aclanthology.org/2026.findings-acl.16/) (Zhou et al., Findings 2026)
ACL
- Shijie Zhou, Jihyung Kil, Ming Li, Jiuxiang Gu, Curtis Wigington, Rajiv Jain, Changyou Chen, and Ruiyi Zhang. 2026. Unveiling Inherent Visual Grounding in Multimodal LLMs for Text-Rich Images. In Findings of the Association for Computational Linguistics: ACL 2026, pages 352–370, San Diego, California, United States. Association for Computational Linguistics.