@inproceedings{shim-etal-2025-revise,
title = "{REVISE}: A Framework for Revising {OCR}ed text in Practical Information Systems with Data Contamination Strategy",
author = "Shim, Gyuho and
Hong, Seongtae and
Lim, Heuiseok",
editor = "Rehm, Georg and
Li, Yunyao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-industry.100/",
doi = "10.18653/v1/2025.acl-industry.100",
pages = "1423--1434",
ISBN = "979-8-89176-288-6",
abstract = "Recent advances in large language models (LLMs) have significantly improved Document AI, demonstrating remarkable performance on document understanding tasks such as question answering. However, existing approaches primarily focus on solving specific tasks, lacking the capability to structurally organize and systematically manage document information. To address this limitation, we propose Revise, a framework that systematically corrects errors introduced by OCR at the character, word, and structural levels. Specifically, Revise employs a comprehensive hierarchical taxonomy of common OCR errors and a synthetic data generation strategy that realistically simulates such errors to train an effective correction model. Experimental results demonstrate that Revise effectively corrects OCR outputs, enabling more structured representation and systematic management of document contents. Consequently, our method significantly enhances downstream performance in document retrieval and question answering tasks, highlighting the potential to overcome the structural management limitations of existing Document AI frameworks."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shim-etal-2025-revise">
<titleInfo>
<title>REVISE: A Framework for Revising OCRed text in Practical Information Systems with Data Contamination Strategy</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gyuho</namePart>
<namePart type="family">Shim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seongtae</namePart>
<namePart type="family">Hong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heuiseok</namePart>
<namePart type="family">Lim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Rehm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-288-6</identifier>
</relatedItem>
<abstract>Recent advances in large language models (LLMs) have significantly improved Document AI, demonstrating remarkable performance on document understanding tasks such as question answering. However, existing approaches primarily focus on solving specific tasks, lacking the capability to structurally organize and systematically manage document information. To address this limitation, we propose Revise, a framework that systematically corrects errors introduced by OCR at the character, word, and structural levels. Specifically, Revise employs a comprehensive hierarchical taxonomy of common OCR errors and a synthetic data generation strategy that realistically simulates such errors to train an effective correction model. Experimental results demonstrate that Revise effectively corrects OCR outputs, enabling more structured representation and systematic management of document contents. Consequently, our method significantly enhances downstream performance in document retrieval and question answering tasks, highlighting the potential to overcome the structural management limitations of existing Document AI frameworks.</abstract>
<identifier type="citekey">shim-etal-2025-revise</identifier>
<identifier type="doi">10.18653/v1/2025.acl-industry.100</identifier>
<location>
<url>https://aclanthology.org/2025.acl-industry.100/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1423</start>
<end>1434</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T REVISE: A Framework for Revising OCRed text in Practical Information Systems with Data Contamination Strategy
%A Shim, Gyuho
%A Hong, Seongtae
%A Lim, Heuiseok
%Y Rehm, Georg
%Y Li, Yunyao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-288-6
%F shim-etal-2025-revise
%X Recent advances in large language models (LLMs) have significantly improved Document AI, demonstrating remarkable performance on document understanding tasks such as question answering. However, existing approaches primarily focus on solving specific tasks, lacking the capability to structurally organize and systematically manage document information. To address this limitation, we propose Revise, a framework that systematically corrects errors introduced by OCR at the character, word, and structural levels. Specifically, Revise employs a comprehensive hierarchical taxonomy of common OCR errors and a synthetic data generation strategy that realistically simulates such errors to train an effective correction model. Experimental results demonstrate that Revise effectively corrects OCR outputs, enabling more structured representation and systematic management of document contents. Consequently, our method significantly enhances downstream performance in document retrieval and question answering tasks, highlighting the potential to overcome the structural management limitations of existing Document AI frameworks.
%R 10.18653/v1/2025.acl-industry.100
%U https://aclanthology.org/2025.acl-industry.100/
%U https://doi.org/10.18653/v1/2025.acl-industry.100
%P 1423-1434
Markdown (Informal)
[REVISE: A Framework for Revising OCRed text in Practical Information Systems with Data Contamination Strategy](https://aclanthology.org/2025.acl-industry.100/) (Shim et al., ACL 2025)
ACL