@inproceedings{mcnamee-etal-2025-oj4ocrmt,
title = "{OJ}4{OCRMT}: A Large Multilingual Dataset for {OCR}-{MT} Evaluation",
author = "McNamee, Paul and
Duh, Kevin and
Carpenter, Cameron and
Colaianni, Ron and
King, Nolan and
Murray, Kenton",
editor = "Bouillon, Pierrette and
Gerlach, Johanna and
Girletti, Sabrina and
Volkart, Lise and
Rubino, Raphael and
Sennrich, Rico and
Farinha, Ana C. and
Gaido, Marco and
Daems, Joke and
Kenny, Dorothy and
Moniz, Helena and
Szoc, Sara",
booktitle = "Proceedings of Machine Translation Summit XX: Volume 1",
month = jun,
year = "2025",
address = "Geneva, Switzerland",
publisher = "European Association for Machine Translation",
url = "https://aclanthology.org/2025.mtsummit-1.9/",
pages = "113--125",
ISBN = "978-2-9701897-0-1",
abstract = "We introduce OJ4OCRMT, an Optical Character Recognition (OCR) dataset for Machine Translation (MT). The dataset supports research on automatic extraction, recognition, and translation of text from document images. The Official Journal of the European Union (OJEU) is the official gazette for the EU. Tens of thousands of pages of legislative acts and regulatory notices are published annually, and parallel translations are available in each of the official languages. Due to its large size, high degree of multilinguality, and carefully produced human translations, the OJEU is a singular resource for language processing research. We have assembled a large collection of parallel pages from the OJEU and have created a dataset to support translation of document images. In this work we introduce the dataset, describe the design decisions which we undertook, and report baseline performance figures for the translation task. It is our hope that this dataset will significantly add to the comparatively few resources presently available for evaluating OCR-MT systems."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mcnamee-etal-2025-oj4ocrmt">
<titleInfo>
<title>OJ4OCRMT: A Large Multilingual Dataset for OCR-MT Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">McNamee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cameron</namePart>
<namePart type="family">Carpenter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ron</namePart>
<namePart type="family">Colaianni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nolan</namePart>
<namePart type="family">King</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenton</namePart>
<namePart type="family">Murray</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit XX: Volume 1</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pierrette</namePart>
<namePart type="family">Bouillon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johanna</namePart>
<namePart type="family">Gerlach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sabrina</namePart>
<namePart type="family">Girletti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lise</namePart>
<namePart type="family">Volkart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raphael</namePart>
<namePart type="family">Rubino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rico</namePart>
<namePart type="family">Sennrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ana</namePart>
<namePart type="given">C.</namePart>
<namePart type="family">Farinha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Gaido</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joke</namePart>
<namePart type="family">Daems</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dorothy</namePart>
<namePart type="family">Kenny</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Moniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Szoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Association for Machine Translation</publisher>
<place>
<placeTerm type="text">Geneva, Switzerland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-2-9701897-0-1</identifier>
</relatedItem>
<abstract>We introduce OJ4OCRMT, an Optical Character Recognition (OCR) dataset for Machine Translation (MT). The dataset supports research on automatic extraction, recognition, and translation of text from document images. The Official Journal of the European Union (OJEU) is the official gazette for the EU. Tens of thousands of pages of legislative acts and regulatory notices are published annually, and parallel translations are available in each of the official languages. Due to its large size, high degree of multilinguality, and carefully produced human translations, the OJEU is a singular resource for language processing research. We have assembled a large collection of parallel pages from the OJEU and have created a dataset to support translation of document images. In this work we introduce the dataset, describe the design decisions which we undertook, and report baseline performance figures for the translation task. It is our hope that this dataset will significantly add to the comparatively few resources presently available for evaluating OCR-MT systems.</abstract>
<identifier type="citekey">mcnamee-etal-2025-oj4ocrmt</identifier>
<location>
<url>https://aclanthology.org/2025.mtsummit-1.9/</url>
</location>
<part>
<date>2025-06</date>
<extent unit="page">
<start>113</start>
<end>125</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T OJ4OCRMT: A Large Multilingual Dataset for OCR-MT Evaluation
%A McNamee, Paul
%A Duh, Kevin
%A Carpenter, Cameron
%A Colaianni, Ron
%A King, Nolan
%A Murray, Kenton
%Y Bouillon, Pierrette
%Y Gerlach, Johanna
%Y Girletti, Sabrina
%Y Volkart, Lise
%Y Rubino, Raphael
%Y Sennrich, Rico
%Y Farinha, Ana C.
%Y Gaido, Marco
%Y Daems, Joke
%Y Kenny, Dorothy
%Y Moniz, Helena
%Y Szoc, Sara
%S Proceedings of Machine Translation Summit XX: Volume 1
%D 2025
%8 June
%I European Association for Machine Translation
%C Geneva, Switzerland
%@ 978-2-9701897-0-1
%F mcnamee-etal-2025-oj4ocrmt
%X We introduce OJ4OCRMT, an Optical Character Recognition (OCR) dataset for Machine Translation (MT). The dataset supports research on automatic extraction, recognition, and translation of text from document images. The Official Journal of the European Union (OJEU) is the official gazette for the EU. Tens of thousands of pages of legislative acts and regulatory notices are published annually, and parallel translations are available in each of the official languages. Due to its large size, high degree of multilinguality, and carefully produced human translations, the OJEU is a singular resource for language processing research. We have assembled a large collection of parallel pages from the OJEU and have created a dataset to support translation of document images. In this work we introduce the dataset, describe the design decisions which we undertook, and report baseline performance figures for the translation task. It is our hope that this dataset will significantly add to the comparatively few resources presently available for evaluating OCR-MT systems.
%U https://aclanthology.org/2025.mtsummit-1.9/
%P 113-125
Markdown (Informal)
[OJ4OCRMT: A Large Multilingual Dataset for OCR-MT Evaluation](https://aclanthology.org/2025.mtsummit-1.9/) (McNamee et al., MTSummit 2025)
ACL