@inproceedings{arrigo-etal-2022-camio,
title = "{CAMIO}: A Corpus for {OCR} in Multiple Languages",
author = "Arrigo, Michael and
Strassel, Stephanie and
King, Nolan and
Tran, Thao and
Mason, Lisa",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.129",
pages = "1209--1216",
abstract = "CAMIO (Corpus of Annotated Multilingual Images for OCR) is a new corpus created by Linguistic Data Consortium to serve as a resource to support the development and evaluation of optical character recognition (OCR) and related technologies for 35 languages across 24 unique scripts. The corpus comprises nearly 70,000 images of machine printed text, covering a wide variety of topics and styles, document domains, attributes and scanning/capture artifacts. Most images have been exhaustively annotated for text localization, resulting in over 2.3M line-level bounding boxes. For 13 of the 35 languages, 1250 images/language have been further annotated with orthographic transcriptions of each line plus specification of reading order, yielding over 2.4M tokens of transcribed text. The resulting annotations are represented in a comprehensive XML output format defined for this corpus. The paper discusses corpus design and implementation, challenges encountered, baseline performance results obtained on the corpus for text localization and OCR decoding, and plans for corpus publication.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arrigo-etal-2022-camio">
<titleInfo>
<title>CAMIO: A Corpus for OCR in Multiple Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Arrigo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stephanie</namePart>
<namePart type="family">Strassel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nolan</namePart>
<namePart type="family">King</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thao</namePart>
<namePart type="family">Tran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lisa</namePart>
<namePart type="family">Mason</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>CAMIO (Corpus of Annotated Multilingual Images for OCR) is a new corpus created by Linguistic Data Consortium to serve as a resource to support the development and evaluation of optical character recognition (OCR) and related technologies for 35 languages across 24 unique scripts. The corpus comprises nearly 70,000 images of machine printed text, covering a wide variety of topics and styles, document domains, attributes and scanning/capture artifacts. Most images have been exhaustively annotated for text localization, resulting in over 2.3M line-level bounding boxes. For 13 of the 35 languages, 1250 images/language have been further annotated with orthographic transcriptions of each line plus specification of reading order, yielding over 2.4M tokens of transcribed text. The resulting annotations are represented in a comprehensive XML output format defined for this corpus. The paper discusses corpus design and implementation, challenges encountered, baseline performance results obtained on the corpus for text localization and OCR decoding, and plans for corpus publication.</abstract>
<identifier type="citekey">arrigo-etal-2022-camio</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.129</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>1209</start>
<end>1216</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CAMIO: A Corpus for OCR in Multiple Languages
%A Arrigo, Michael
%A Strassel, Stephanie
%A King, Nolan
%A Tran, Thao
%A Mason, Lisa
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F arrigo-etal-2022-camio
%X CAMIO (Corpus of Annotated Multilingual Images for OCR) is a new corpus created by Linguistic Data Consortium to serve as a resource to support the development and evaluation of optical character recognition (OCR) and related technologies for 35 languages across 24 unique scripts. The corpus comprises nearly 70,000 images of machine printed text, covering a wide variety of topics and styles, document domains, attributes and scanning/capture artifacts. Most images have been exhaustively annotated for text localization, resulting in over 2.3M line-level bounding boxes. For 13 of the 35 languages, 1250 images/language have been further annotated with orthographic transcriptions of each line plus specification of reading order, yielding over 2.4M tokens of transcribed text. The resulting annotations are represented in a comprehensive XML output format defined for this corpus. The paper discusses corpus design and implementation, challenges encountered, baseline performance results obtained on the corpus for text localization and OCR decoding, and plans for corpus publication.
%U https://aclanthology.org/2022.lrec-1.129
%P 1209-1216
Markdown (Informal)
[CAMIO: A Corpus for OCR in Multiple Languages](https://aclanthology.org/2022.lrec-1.129) (Arrigo et al., LREC 2022)
ACL
- Michael Arrigo, Stephanie Strassel, Nolan King, Thao Tran, and Lisa Mason. 2022. CAMIO: A Corpus for OCR in Multiple Languages. In Proceedings of the Thirteenth Language Resources and Evaluation Conference, pages 1209–1216, Marseille, France. European Language Resources Association.