@inproceedings{zhang-etal-2025-chaotic,
title = "From Chaotic {OCR} Words to Coherent Document: A Fine-to-Coarse Zoom-Out Network for Complex-Layout Document Image Translation",
author = "Zhang, Zhiyang and
Zhang, Yaping and
Liang, Yupu and
Xiang, Lu and
Zhao, Yang and
Zhou, Yu and
Zong, Chengqing",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.723/",
pages = "10877--10890",
abstract = "Document Image Translation (DIT) aims to translate documents in images from one language to another. It requires visual layouts and textual contents understanding, as well as document coherence capturing. However, current methods often rely on the quality of OCR output, which, particularly in complex-layout scenarios, frequently loses the crucial document coherence, leading to chaotic text. To overcome this problem, we introduce a novel end-to-end network, named Zoom-out DIT (ZoomDIT), inspired by human translation procedures. It jointly accomplishes the multi-level tasks including word positioning, sentence recognition {\&} translation, and document organization, based on a fine-to-coarse zoom-out framework, to progressively realize {\textquotedblleft}chaotic words to coherent document{\textquotedblright} and improve translation. We further contribute a new large-scale DIT dataset with multi-level fine-grained labels. Extensive experiments on public and our new dataset demonstrate significant improvements in translation quality towards complex-layout document images, offering a robust solution for reorganizing the chaotic OCR outputs to a coherent document translation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-chaotic">
<titleInfo>
<title>From Chaotic OCR Words to Coherent Document: A Fine-to-Coarse Zoom-Out Network for Complex-Layout Document Image Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhiyang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaping</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yupu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Document Image Translation (DIT) aims to translate documents in images from one language to another. It requires visual layouts and textual contents understanding, as well as document coherence capturing. However, current methods often rely on the quality of OCR output, which, particularly in complex-layout scenarios, frequently loses the crucial document coherence, leading to chaotic text. To overcome this problem, we introduce a novel end-to-end network, named Zoom-out DIT (ZoomDIT), inspired by human translation procedures. It jointly accomplishes the multi-level tasks including word positioning, sentence recognition & translation, and document organization, based on a fine-to-coarse zoom-out framework, to progressively realize “chaotic words to coherent document” and improve translation. We further contribute a new large-scale DIT dataset with multi-level fine-grained labels. Extensive experiments on public and our new dataset demonstrate significant improvements in translation quality towards complex-layout document images, offering a robust solution for reorganizing the chaotic OCR outputs to a coherent document translation.</abstract>
<identifier type="citekey">zhang-etal-2025-chaotic</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.723/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>10877</start>
<end>10890</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From Chaotic OCR Words to Coherent Document: A Fine-to-Coarse Zoom-Out Network for Complex-Layout Document Image Translation
%A Zhang, Zhiyang
%A Zhang, Yaping
%A Liang, Yupu
%A Xiang, Lu
%A Zhao, Yang
%A Zhou, Yu
%A Zong, Chengqing
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhang-etal-2025-chaotic
%X Document Image Translation (DIT) aims to translate documents in images from one language to another. It requires visual layouts and textual contents understanding, as well as document coherence capturing. However, current methods often rely on the quality of OCR output, which, particularly in complex-layout scenarios, frequently loses the crucial document coherence, leading to chaotic text. To overcome this problem, we introduce a novel end-to-end network, named Zoom-out DIT (ZoomDIT), inspired by human translation procedures. It jointly accomplishes the multi-level tasks including word positioning, sentence recognition & translation, and document organization, based on a fine-to-coarse zoom-out framework, to progressively realize “chaotic words to coherent document” and improve translation. We further contribute a new large-scale DIT dataset with multi-level fine-grained labels. Extensive experiments on public and our new dataset demonstrate significant improvements in translation quality towards complex-layout document images, offering a robust solution for reorganizing the chaotic OCR outputs to a coherent document translation.
%U https://aclanthology.org/2025.coling-main.723/
%P 10877-10890
Markdown (Informal)
[From Chaotic OCR Words to Coherent Document: A Fine-to-Coarse Zoom-Out Network for Complex-Layout Document Image Translation](https://aclanthology.org/2025.coling-main.723/) (Zhang et al., COLING 2025)
ACL