% BibTeX record for ACL Anthology paper 2026.findings-acl.1818 (Findings of ACL 2026).
% These lines sit outside the entry, where BibTeX ignores free text.
@inproceedings{wen-etal-2026-switching,
  author    = {Wen, Ximing and
               Li, Wenbo and
               Paul, Sudipta and
               Saidutta, Yashas Malur and
               Gunaratna, Kalpa and
               Chappidi, Srinivas},
  title     = {Switching Heads and Softening Tokens: Turnkey Solutions to Visually Grounded Document {QA}},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1818/},
  pages     = {36490--36503},
  isbn      = {979-8-89176-395-1},
  abstract  = {Visually Grounded Document Question Answering often lacks robust, end-to-end solutions capable of handling complex, multi-answer queries without reliance on ad-hoc processing. In this work, we propose two turnkey LLM architectures to address this gap. We first introduce a single-head architecture where coordinates are represented as special tokens within the unified vocabulary. While structurally robust, this approach suffers from the limitations of discrete supervision; to address this, we propose a novel ``softening token'' method that enables differentiable Mean-Squared-Error loss over token probabilities. Although this significantly improves visual grounding, the spatial precision remains bound by discretization. Consequently, we propose a second solution: a dual-head architecture that alternates between text generation and regression-based bounding box prediction. This method offers high spatial precision via a regression head, further stabilized by our introduction of an Intersection-over-Union loss. Finally, by combining the single head model{'}s structural robustness with the high precision of the dual head model, we propose an ensemble method that yields significant performance gains beyond each of individual components.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for citekey wen-etal-2026-switching: six author names, then the host proceedings (relatedItem) with four editor names, followed by abstract, URL and page extent. -->
  <mods ID="wen-etal-2026-switching">
    <titleInfo>
      <title>Switching Heads and Softening Tokens: Turnkey Solutions to Visually Grounded Document QA</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ximing</namePart>
      <namePart type="family">Wen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenbo</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sudipta</namePart>
      <namePart type="family">Paul</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yashas</namePart>
      <namePart type="given">Malur</namePart>
      <namePart type="family">Saidutta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kalpa</namePart>
      <namePart type="family">Gunaratna</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Srinivas</namePart>
      <namePart type="family">Chappidi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Visually Grounded Document Question Answering often lacks robust, end-to-end solutions capable of handling complex, multi-answer queries without reliance on ad-hoc processing. In this work, we propose two turnkey LLM architectures to address this gap. We first introduce a single-head architecture where coordinates are represented as special tokens within the unified vocabulary. While structurally robust, this approach suffers from the limitations of discrete supervision; to address this, we propose a novel “softening token” method that enables differentiable Mean-Squared-Error loss over token probabilities. Although this significantly improves visual grounding, the spatial precision remains bound by discretization. Consequently, we propose a second solution: a dual-head architecture that alternates between text generation and regression-based bounding box prediction. This method offers high spatial precision via a regression head, further stabilized by our introduction of an Intersection-over-Union loss. Finally, by combining the single head model’s structural robustness with the high precision of the dual head model, we propose an ensemble method that yields significant performance gains beyond each of individual components.</abstract>
    <identifier type="citekey">wen-etal-2026-switching</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1818/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>36490</start>
        <end>36503</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Switching Heads and Softening Tokens: Turnkey Solutions to Visually Grounded Document QA
%A Wen, Ximing
%A Li, Wenbo
%A Paul, Sudipta
%A Saidutta, Yashas Malur
%A Gunaratna, Kalpa
%A Chappidi, Srinivas
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F wen-etal-2026-switching
%X Visually Grounded Document Question Answering often lacks robust, end-to-end solutions capable of handling complex, multi-answer queries without reliance on ad-hoc processing. In this work, we propose two turnkey LLM architectures to address this gap. We first introduce a single-head architecture where coordinates are represented as special tokens within the unified vocabulary. While structurally robust, this approach suffers from the limitations of discrete supervision; to address this, we propose a novel “softening token” method that enables differentiable Mean-Squared-Error loss over token probabilities. Although this significantly improves visual grounding, the spatial precision remains bound by discretization. Consequently, we propose a second solution: a dual-head architecture that alternates between text generation and regression-based bounding box prediction. This method offers high spatial precision via a regression head, further stabilized by our introduction of an Intersection-over-Union loss. Finally, by combining the single head model’s structural robustness with the high precision of the dual head model, we propose an ensemble method that yields significant performance gains beyond each of individual components.
%U https://aclanthology.org/2026.findings-acl.1818/
%P 36490-36503
Markdown (Informal)
[Switching Heads and Softening Tokens: Turnkey Solutions to Visually Grounded Document QA](https://aclanthology.org/2026.findings-acl.1818/) (Wen et al., Findings 2026)
ACL