@inproceedings{afli-way-2016-integrating,
title = "Integrating Optical Character Recognition and Machine Translation of Historical Documents",
author = "Afli, Haithem and
Way, Andy",
editor = "Hinrichs, Erhard and
Hinrichs, Marie and
Trippel, Thorsten",
booktitle = "Proceedings of the Workshop on Language Technology Resources and Tools for Digital Humanities ({LT}4{DH})",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/W16-4015",
pages = "109--116",
abstract = "Machine Translation (MT) plays a critical role in expanding capacity in the translation industry. However, many valuable documents, including digital documents, are encoded in non-accessible formats for machine processing (e.g., Historical or Legal documents). Such documents must be passed through a process of Optical Character Recognition (OCR) to render the text suitable for MT. No matter how good the OCR is, this process introduces recognition errors, which often renders MT ineffective. In this paper, we propose a new OCR to MT framework based on adding a new OCR error correction module to enhance the overall quality of translation. Experimentation shows that our new system correction based on the combination of Language Modeling and Translation methods outperforms the baseline system by nearly 30{\%} relative improvement.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="afli-way-2016-integrating">
<titleInfo>
<title>Integrating Optical Character Recognition and Machine Translation of Historical Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haithem</namePart>
<namePart type="family">Afli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andy</namePart>
<namePart type="family">Way</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Language Technology Resources and Tools for Digital Humanities (LT4DH)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Erhard</namePart>
<namePart type="family">Hinrichs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Hinrichs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thorsten</namePart>
<namePart type="family">Trippel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Machine Translation (MT) plays a critical role in expanding capacity in the translation industry. However, many valuable documents, including digital documents, are encoded in non-accessible formats for machine processing (e.g., Historical or Legal documents). Such documents must be passed through a process of Optical Character Recognition (OCR) to render the text suitable for MT. No matter how good the OCR is, this process introduces recognition errors, which often renders MT ineffective. In this paper, we propose a new OCR to MT framework based on adding a new OCR error correction module to enhance the overall quality of translation. Experimentation shows that our new system correction based on the combination of Language Modeling and Translation methods outperforms the baseline system by nearly 30% relative improvement.</abstract>
<identifier type="citekey">afli-way-2016-integrating</identifier>
<location>
<url>https://aclanthology.org/W16-4015</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>109</start>
<end>116</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Integrating Optical Character Recognition and Machine Translation of Historical Documents
%A Afli, Haithem
%A Way, Andy
%Y Hinrichs, Erhard
%Y Hinrichs, Marie
%Y Trippel, Thorsten
%S Proceedings of the Workshop on Language Technology Resources and Tools for Digital Humanities (LT4DH)
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F afli-way-2016-integrating
%X Machine Translation (MT) plays a critical role in expanding capacity in the translation industry. However, many valuable documents, including digital documents, are encoded in non-accessible formats for machine processing (e.g., Historical or Legal documents). Such documents must be passed through a process of Optical Character Recognition (OCR) to render the text suitable for MT. No matter how good the OCR is, this process introduces recognition errors, which often renders MT ineffective. In this paper, we propose a new OCR to MT framework based on adding a new OCR error correction module to enhance the overall quality of translation. Experimentation shows that our new system correction based on the combination of Language Modeling and Translation methods outperforms the baseline system by nearly 30% relative improvement.
%U https://aclanthology.org/W16-4015
%P 109-116
Markdown (Informal)
[Integrating Optical Character Recognition and Machine Translation of Historical Documents](https://aclanthology.org/W16-4015) (Afli & Way, LT4DH 2016)
ACL