@comment{Conference paper record; the canonical landing page is given in the url field.
  The math fragment in the title is brace-protected so that sentence-casing
  styles cannot lowercase the V. Proper nouns in the booktitle are protected
  as whole words rather than by single-letter braces.
  NOTE(review): address appears to hold the meeting location rather than the
  publisher's city -- confirm this is what the target style expects.}
@inproceedings{zheng-etal-2026-doc,
    title     = {{Doc-$V^*$}: Coarse-to-Fine Interactive Visual Reasoning for Multi-Page Document {VQA}},
    author    = {Zheng, Yuanlei and
                 Fu, Pei and
                 Li, Hang and
                 Wang, Ziyang and
                 Zhang, Yuyi and
                 Ruan, Wenyu and
                 Zhang, Xiaojin and
                 Wei, Zhongyu and
                 Luo, Zhenbo and
                 Luan, Jian and
                 Chen, Wei and
                 Bai, Xiang},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.acl-long.2129/},
    pages     = {45901--45923},
    isbn      = {979-8-89176-390-6},
    abstract  = {Multi-page Document Visual Question Answering requires reasoning over semantics, layouts, and visual elements in long, visually dense documents. Existing OCR-free methods face a trade-off between capacity and precision: end-to-end models scale poorly with document length, while visual retrieval-based pipelines are brittle and passive. We propose \textbf{Doc-$V^*$}, an \textbf{OCR-free agentic} framework that casts multi-page DocVQA as sequential evidence aggregation. \textbf{Doc-$V^*$} begins with a thumbnail overview, then actively navigates via semantic retrieval and targeted page fetching, and aggregates evidence in a structured working memory for grounded reasoning. Trained by imitation learning from expert trajectories and further optimized with Group Relative Policy Optimization, \textbf{Doc-$V^*$} balances answer accuracy with evidence-seeking efficiency. Across five benchmarks, \textbf{Doc-$V^*$} outperforms open-source baselines and approaches proprietary models, improving out-of-domain performance by up to \textbf{47.9{\%}} over RAG baseline. Other results reveal effective evidence aggregation with selective attention, not increased input pages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for citekey zheng-etal-2026-doc: twelve author names on the
       paper itself; the host item (the proceedings volume) carries the four
       editors, publisher, place, genre and ISBN. -->
  <mods ID="zheng-etal-2026-doc">
    <titleInfo>
      <title>Doc-V^*: Coarse-to-Fine Interactive Visual Reasoning for Multi-Page Document VQA</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yuanlei</namePart>
      <namePart type="family">Zheng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pei</namePart>
      <namePart type="family">Fu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hang</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ziyang</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuyi</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenyu</namePart>
      <namePart type="family">Ruan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiaojin</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhongyu</namePart>
      <namePart type="family">Wei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhenbo</namePart>
      <namePart type="family">Luo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jian</namePart>
      <namePart type="family">Luan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wei</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiang</namePart>
      <namePart type="family">Bai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>
    <abstract>Multi-page Document Visual Question Answering requires reasoning over semantics, layouts, and visual elements in long, visually dense documents. Existing OCR-free methods face a trade-off between capacity and precision: end-to-end models scale poorly with document length, while visual retrieval-based pipelines are brittle and passive. We propose Doc-V^*, an OCR-free agentic framework that casts multi-page DocVQA as sequential evidence aggregation. Doc-V^* begins with a thumbnail overview, then actively navigates via semantic retrieval and targeted page fetching, and aggregates evidence in a structured working memory for grounded reasoning. Trained by imitation learning from expert trajectories and further optimized with Group Relative Policy Optimization, Doc-V^* balances answer accuracy with evidence-seeking efficiency. Across five benchmarks, Doc-V^* outperforms open-source baselines and approaches proprietary models, improving out-of-domain performance by up to 47.9% over RAG baseline. Other results reveal effective evidence aggregation with selective attention, not increased input pages.</abstract>
    <identifier type="citekey">zheng-etal-2026-doc</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.2129/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>45901</start>
        <end>45923</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Doc-V^*: Coarse-to-Fine Interactive Visual Reasoning for Multi-Page Document VQA
%A Zheng, Yuanlei
%A Fu, Pei
%A Li, Hang
%A Wang, Ziyang
%A Zhang, Yuyi
%A Ruan, Wenyu
%A Zhang, Xiaojin
%A Wei, Zhongyu
%A Luo, Zhenbo
%A Luan, Jian
%A Chen, Wei
%A Bai, Xiang
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F zheng-etal-2026-doc
%X Multi-page Document Visual Question Answering requires reasoning over semantics, layouts, and visual elements in long, visually dense documents. Existing OCR-free methods face a trade-off between capacity and precision: end-to-end models scale poorly with document length, while visual retrieval-based pipelines are brittle and passive. We propose Doc-V^*, an OCR-free agentic framework that casts multi-page DocVQA as sequential evidence aggregation. Doc-V^* begins with a thumbnail overview, then actively navigates via semantic retrieval and targeted page fetching, and aggregates evidence in a structured working memory for grounded reasoning. Trained by imitation learning from expert trajectories and further optimized with Group Relative Policy Optimization, Doc-V^* balances answer accuracy with evidence-seeking efficiency. Across five benchmarks, Doc-V^* outperforms open-source baselines and approaches proprietary models, improving out-of-domain performance by up to 47.9% over RAG baseline. Other results reveal effective evidence aggregation with selective attention, not increased input pages.
%U https://aclanthology.org/2026.acl-long.2129/
%P 45901-45923
Markdown (Informal)
[Doc-V*: Coarse-to-Fine Interactive Visual Reasoning for Multi-Page Document VQA](https://aclanthology.org/2026.acl-long.2129/) (Zheng et al., ACL 2026)
ACL
- Yuanlei Zheng, Pei Fu, Hang Li, Ziyang Wang, Yuyi Zhang, Wenyu Ruan, Xiaojin Zhang, Zhongyu Wei, Zhenbo Luo, Jian Luan, Wei Chen, and Xiang Bai. 2026. Doc-V*: Coarse-to-Fine Interactive Visual Reasoning for Multi-Page Document VQA. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 45901–45923, San Diego, California, United States. Association for Computational Linguistics.