@inproceedings{jiang-etal-2026-global,
title = "Global Context or Local Detail? Adaptive Visual Grounding for Hallucination Mitigation",
author = "Jiang, Yubo and
Yang, Xin and
Wuerkaixi, Abudukelimu and
Yuan, Zheming and
Cheng, Xuxin and
Liu, Cao and
Zeng, Ke and
Xie, Fengying and
Jiang, Zhiguo and
Zhang, Haopeng",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.745/",
pages = "15133--15152",
ISBN = "979-8-89176-395-1",
abstract = "Large vision{--}language models (LVLMs) excel at multimodal reasoning but still suffer from object-existence hallucinations when multi-step deliberation decouples from visual evidence. Think-with-Images (TwI) attempts to counter this by generating auxiliary observations (e.g., zoomed crops or highlighted views), yet it is not reliably beneficial. We identify two coupled failure modes: (1) a granularity{--}context trade-off of common operators, where zoom-in improves local detail but breaks global relations, while highlighting preserves topology but lacks fine evidence; and (2) an over-trust issue in tool-guided region proposals, where mislocalized evidence can dominate reasoning and even underperform standard prompting. We propose Active-Look, a training-free, plug-and-play TwI framework that allocates visual computation by uncertainty. Active-Look runs two heterogeneous grounding experts in parallel and uses their disagreement as a proxy for uncertainty, spending the budget only to verify disputed regions. It further mitigates the operator trade-off with conflict-aware hybrid rendering: highlighting retains global context, while selective zoom-in performs local verification. Experiments on hallucination-focused and general benchmarks (POPE, MME, and CHAIR) across multiple LVLM backbones show consistent gains."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jiang-etal-2026-global">
<titleInfo>
<title>Global Context or Local Detail? Adaptive Visual Grounding for Hallucination Mitigation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yubo</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abudukelimu</namePart>
<namePart type="family">Wuerkaixi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheming</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuxin</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cao</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ke</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fengying</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiguo</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haopeng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large vision–language models (LVLMs) excel at multimodal reasoning but still suffer from object-existence hallucinations when multi-step deliberation decouples from visual evidence. Think-with-Images (TwI) attempts to counter this by generating auxiliary observations (e.g., zoomed crops or highlighted views), yet it is not reliably beneficial. We identify two coupled failure modes: (1) a granularity–context trade-off of common operators, where zoom-in improves local detail but breaks global relations, while highlighting preserves topology but lacks fine evidence; and (2) an over-trust issue in tool-guided region proposals, where mislocalized evidence can dominate reasoning and even underperform standard prompting. We propose Active-Look, a training-free, plug-and-play TwI framework that allocates visual computation by uncertainty. Active-Look runs two heterogeneous grounding experts in parallel and uses their disagreement as a proxy for uncertainty, spending the budget only to verify disputed regions. It further mitigates the operator trade-off with conflict-aware hybrid rendering: highlighting retains global context, while selective zoom-in performs local verification. Experiments on hallucination-focused and general benchmarks (POPE, MME, and CHAIR) across multiple LVLM backbones show consistent gains.</abstract>
<identifier type="citekey">jiang-etal-2026-global</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.745/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>15133</start>
<end>15152</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Global Context or Local Detail? Adaptive Visual Grounding for Hallucination Mitigation
%A Jiang, Yubo
%A Yang, Xin
%A Wuerkaixi, Abudukelimu
%A Yuan, Zheming
%A Cheng, Xuxin
%A Liu, Cao
%A Zeng, Ke
%A Xie, Fengying
%A Jiang, Zhiguo
%A Zhang, Haopeng
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F jiang-etal-2026-global
%X Large vision–language models (LVLMs) excel at multimodal reasoning but still suffer from object-existence hallucinations when multi-step deliberation decouples from visual evidence. Think-with-Images (TwI) attempts to counter this by generating auxiliary observations (e.g., zoomed crops or highlighted views), yet it is not reliably beneficial. We identify two coupled failure modes: (1) a granularity–context trade-off of common operators, where zoom-in improves local detail but breaks global relations, while highlighting preserves topology but lacks fine evidence; and (2) an over-trust issue in tool-guided region proposals, where mislocalized evidence can dominate reasoning and even underperform standard prompting. We propose Active-Look, a training-free, plug-and-play TwI framework that allocates visual computation by uncertainty. Active-Look runs two heterogeneous grounding experts in parallel and uses their disagreement as a proxy for uncertainty, spending the budget only to verify disputed regions. It further mitigates the operator trade-off with conflict-aware hybrid rendering: highlighting retains global context, while selective zoom-in performs local verification. Experiments on hallucination-focused and general benchmarks (POPE, MME, and CHAIR) across multiple LVLM backbones show consistent gains.
%U https://aclanthology.org/2026.findings-acl.745/
%P 15133-15152
Markdown (Informal)
[Global Context or Local Detail? Adaptive Visual Grounding for Hallucination Mitigation](https://aclanthology.org/2026.findings-acl.745/) (Jiang et al., Findings 2026)
ACL
- Yubo Jiang, Xin Yang, Abudukelimu Wuerkaixi, Zheming Yuan, Xuxin Cheng, Cao Liu, Ke Zeng, Fengying Xie, Zhiguo Jiang, and Haopeng Zhang. 2026. Global Context or Local Detail? Adaptive Visual Grounding for Hallucination Mitigation. In Findings of the Association for Computational Linguistics: ACL 2026, pages 15133–15152, San Diego, California, United States. Association for Computational Linguistics.