@inproceedings{kashid-bhattacharyya-2024-roundtripocr,
title = "{R}ound{T}rip{OCR}: A Data Generation Technique for Enhancing Post-{OCR} Error Correction in Low-Resource {D}evanagari Languages",
author = "Kashid, Harshvivek and
Bhattacharyya, Pushpak",
editor = "Lalitha Devi, Sobha and
Arora, Karunesh",
booktitle = "Proceedings of the 21st International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2024",
address = "AU-KBC Research Centre, Chennai, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2024.icon-1.33/",
pages = "274--284",
abstract = "Optical Character Recognition (OCR) technology has revolutionized the digitization of printed text, enabling efficient data extraction and analysis across various domains. Just like Machine Translation systems, OCR systems are prone to errors. In this work, we address the challenge of data generation and post-OCR error correction, specifically for low-resource languages. We propose an approach for synthetic data generation for Devanagari languages, RoundTripOCR, that tackles the scarcity of the post-OCR Error Correction datasets for low-resource languages. We release post-OCR text correction datasets for Hindi, Marathi, Bodo, Nepali, Konkani and Sanskrit. We also present a novel approach for OCR error correction by leveraging techniques from machine translation. Our method involves translating erroneous OCR output into a corrected form by treating the OCR errors as mistranslations in a parallel text corpus, employing pre-trained transformer models to learn the mapping from erroneous to correct text pairs, effectively correcting OCR errors."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kashid-bhattacharyya-2024-roundtripocr">
<titleInfo>
<title>RoundTripOCR: A Data Generation Technique for Enhancing Post-OCR Error Correction in Low-Resource Devanagari Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Harshvivek</namePart>
<namePart type="family">Kashid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karunesh</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Optical Character Recognition (OCR) technology has revolutionized the digitization of printed text, enabling efficient data extraction and analysis across various domains. Just like Machine Translation systems, OCR systems are prone to errors. In this work, we address the challenge of data generation and post-OCR error correction, specifically for low-resource languages. We propose an approach for synthetic data generation for Devanagari languages, RoundTripOCR, that tackles the scarcity of the post-OCR Error Correction datasets for low-resource languages. We release post-OCR text correction datasets for Hindi, Marathi, Bodo, Nepali, Konkani and Sanskrit. We also present a novel approach for OCR error correction by leveraging techniques from machine translation. Our method involves translating erroneous OCR output into a corrected form by treating the OCR errors as mistranslations in a parallel text corpus, employing pre-trained transformer models to learn the mapping from erroneous to correct text pairs, effectively correcting OCR errors.</abstract>
<identifier type="citekey">kashid-bhattacharyya-2024-roundtripocr</identifier>
<location>
<url>https://aclanthology.org/2024.icon-1.33/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>274</start>
<end>284</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T RoundTripOCR: A Data Generation Technique for Enhancing Post-OCR Error Correction in Low-Resource Devanagari Languages
%A Kashid, Harshvivek
%A Bhattacharyya, Pushpak
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F kashid-bhattacharyya-2024-roundtripocr
%X Optical Character Recognition (OCR) technology has revolutionized the digitization of printed text, enabling efficient data extraction and analysis across various domains. Just like Machine Translation systems, OCR systems are prone to errors. In this work, we address the challenge of data generation and post-OCR error correction, specifically for low-resource languages. We propose an approach for synthetic data generation for Devanagari languages, RoundTripOCR, that tackles the scarcity of the post-OCR Error Correction datasets for low-resource languages. We release post-OCR text correction datasets for Hindi, Marathi, Bodo, Nepali, Konkani and Sanskrit. We also present a novel approach for OCR error correction by leveraging techniques from machine translation. Our method involves translating erroneous OCR output into a corrected form by treating the OCR errors as mistranslations in a parallel text corpus, employing pre-trained transformer models to learn the mapping from erroneous to correct text pairs, effectively correcting OCR errors.
%U https://aclanthology.org/2024.icon-1.33/
%P 274-284
Markdown (Informal)
[RoundTripOCR: A Data Generation Technique for Enhancing Post-OCR Error Correction in Low-Resource Devanagari Languages](https://aclanthology.org/2024.icon-1.33/) (Kashid & Bhattacharyya, ICON 2024)
ACL