@inproceedings{xing-etal-2025-intelligent,
title = "Intelligent Document Parsing: Towards End-to-end Document Parsing via Decoupled Content Parsing and Layout Grounding",
author = "Xing, Hangdi and
Gao, Feiyu and
Zheng, Qi and
Zhu, Zhaoqing and
Shao, Zirui and
Yan, Ming",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1088/",
pages = "19987--19998",
ISBN = "979-8-89176-335-7",
abstract = "In the daily work, vast amounts of documents are stored in pixel-based formats such as images and scanned PDFs, posing challenges for efficient database management and data processing. Existing methods often fragment the parsing process into the pipeline of separated subtasks on the layout element level, resulting in incomplete semantics and error propagation. Even though models based on multi-modal large language models (MLLMs) mitigate the issues to some extent, they also suffer from absent or sub-optimal grounding ability for visual information. To address these challenges, we introduce the Intelligent Document Parsing (IDP) framework, an end-to-end document parsing framework leveraging the vision-language priors of MLLMs, equipped with an elaborately designed document representation and decoding mechanism to decouple the content parsing and layout grounding to fully activate the potential of MLLMs for document parsing. Experimental results demonstrate that the IDP method surpasses existing methods, significantly advancing MLLM-based document parsing."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xing-etal-2025-intelligent">
<titleInfo>
<title>Intelligent Document Parsing: Towards End-to-end Document Parsing via Decoupled Content Parsing and Layout Grounding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hangdi</namePart>
<namePart type="family">Xing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Feiyu</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhaoqing</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zirui</namePart>
<namePart type="family">Shao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>In the daily work, vast amounts of documents are stored in pixel-based formats such as images and scanned PDFs, posing challenges for efficient database management and data processing. Existing methods often fragment the parsing process into the pipeline of separated subtasks on the layout element level, resulting in incomplete semantics and error propagation. Even though models based on multi-modal large language models (MLLMs) mitigate the issues to some extent, they also suffer from absent or sub-optimal grounding ability for visual information. To address these challenges, we introduce the Intelligent Document Parsing (IDP) framework, an end-to-end document parsing framework leveraging the vision-language priors of MLLMs, equipped with an elaborately designed document representation and decoding mechanism to decouple the content parsing and layout grounding to fully activate the potential of MLLMs for document parsing. Experimental results demonstrate that the IDP method surpasses existing methods, significantly advancing MLLM-based document parsing.</abstract>
<identifier type="citekey">xing-etal-2025-intelligent</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1088/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>19987</start>
<end>19998</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Intelligent Document Parsing: Towards End-to-end Document Parsing via Decoupled Content Parsing and Layout Grounding
%A Xing, Hangdi
%A Gao, Feiyu
%A Zheng, Qi
%A Zhu, Zhaoqing
%A Shao, Zirui
%A Yan, Ming
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F xing-etal-2025-intelligent
%X In the daily work, vast amounts of documents are stored in pixel-based formats such as images and scanned PDFs, posing challenges for efficient database management and data processing. Existing methods often fragment the parsing process into the pipeline of separated subtasks on the layout element level, resulting in incomplete semantics and error propagation. Even though models based on multi-modal large language models (MLLMs) mitigate the issues to some extent, they also suffer from absent or sub-optimal grounding ability for visual information. To address these challenges, we introduce the Intelligent Document Parsing (IDP) framework, an end-to-end document parsing framework leveraging the vision-language priors of MLLMs, equipped with an elaborately designed document representation and decoding mechanism to decouple the content parsing and layout grounding to fully activate the potential of MLLMs for document parsing. Experimental results demonstrate that the IDP method surpasses existing methods, significantly advancing MLLM-based document parsing.
%U https://aclanthology.org/2025.findings-emnlp.1088/
%P 19987-19998
Markdown (Informal)
[Intelligent Document Parsing: Towards End-to-end Document Parsing via Decoupled Content Parsing and Layout Grounding](https://aclanthology.org/2025.findings-emnlp.1088/) (Xing et al., Findings 2025)
ACL