@inproceedings{schaefer-neudecker-2020-two,
title = "A Two-Step Approach for Automatic {OCR} Post-Correction",
author = "Schaefer, Robin and
Neudecker, Clemens",
editor = "DeGaetano, Stefania and
Kazantseva, Anna and
Reiter, Nils and
Szpakowicz, Stan",
booktitle = "Proceedings of the 4th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature",
month = dec,
year = "2020",
address = "Online",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.latechclfl-1.6",
pages = "52--57",
abstract = "The quality of Optical Character Recognition (OCR) is a key factor in the digitisation of historical documents. OCR errors are a major obstacle for downstream tasks and have hindered advances in the usage of the digitised documents. In this paper we present a two-step approach to automatic OCR post-correction. The first component is responsible for detecting erroneous sequences in a set of OCRed texts, while the second is designed for correcting OCR errors in them. We show that applying the preceding detection model reduces both the character error rate (CER) compared to a simple one-step correction model and the amount of falsely changed correct characters.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schaefer-neudecker-2020-two">
<titleInfo>
<title>A Two-Step Approach for Automatic OCR Post-Correction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Robin</namePart>
<namePart type="family">Schaefer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clemens</namePart>
<namePart type="family">Neudecker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stefania</namePart>
<namePart type="family">DeGaetano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kazantseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nils</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stan</namePart>
<namePart type="family">Szpakowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The quality of Optical Character Recognition (OCR) is a key factor in the digitisation of historical documents. OCR errors are a major obstacle for downstream tasks and have hindered advances in the usage of the digitised documents. In this paper we present a two-step approach to automatic OCR post-correction. The first component is responsible for detecting erroneous sequences in a set of OCRed texts, while the second is designed for correcting OCR errors in them. We show that applying the preceding detection model reduces both the character error rate (CER) compared to a simple one-step correction model and the amount of falsely changed correct characters.</abstract>
<identifier type="citekey">schaefer-neudecker-2020-two</identifier>
<location>
<url>https://aclanthology.org/2020.latechclfl-1.6</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>52</start>
<end>57</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Two-Step Approach for Automatic OCR Post-Correction
%A Schaefer, Robin
%A Neudecker, Clemens
%Y DeGaetano, Stefania
%Y Kazantseva, Anna
%Y Reiter, Nils
%Y Szpakowicz, Stan
%S Proceedings of the 4th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Online
%F schaefer-neudecker-2020-two
%X The quality of Optical Character Recognition (OCR) is a key factor in the digitisation of historical documents. OCR errors are a major obstacle for downstream tasks and have hindered advances in the usage of the digitised documents. In this paper we present a two-step approach to automatic OCR post-correction. The first component is responsible for detecting erroneous sequences in a set of OCRed texts, while the second is designed for correcting OCR errors in them. We show that applying the preceding detection model reduces both the character error rate (CER) compared to a simple one-step correction model and the amount of falsely changed correct characters.
%U https://aclanthology.org/2020.latechclfl-1.6
%P 52-57
Markdown (Informal)
[A Two-Step Approach for Automatic OCR Post-Correction](https://aclanthology.org/2020.latechclfl-1.6) (Schaefer & Neudecker, LaTeCHCLfL 2020)
ACL
- Robin Schaefer and Clemens Neudecker. 2020. A Two-Step Approach for Automatic OCR Post-Correction. In Proceedings of the 4th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, pages 52–57, Online. International Committee on Computational Linguistics.