@inproceedings{hubert-etal-2016-training,
title = "Training {\&} Quality Assessment of an Optical Character Recognition Model for {N}orthern {H}aida",
author = "Hubert, Isabell and
Arppe, Antti and
Lachler, Jordan and
Santos, Eddie A.",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1514",
pages = "3227--3234",
abstract = "We are presenting our work on the creation of the first optical character recognition (OCR) model for Northern Haida, also known as Masset or Xaad Kil, a nearly extinct First Nations language spoken in the Haida Gwaii archipelago in British Columbia, Canada. We are addressing the challenges of training an OCR model for a language with an extensive, non-standard Latin character set as follows: (1) We have compared various training approaches and present the results of practical analyses to maximize recognition accuracy and minimize manual labor. An approach using just one or two pages of Source Images directly performed better than the Image Generation approach, and better than models based on three or more pages. Analyses also suggest that a character{'}s frequency is directly correlated with its recognition accuracy. (2) We present an overview of current OCR accuracy analysis tools available. (3) We have ported the once de-facto standardized OCR accuracy tools to be able to cope with Unicode input. Our work adds to a growing body of research on OCR for particularly challenging character sets, and contributes to creating the largest electronic corpus for this severely endangered language.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hubert-etal-2016-training">
<titleInfo>
<title>Training & Quality Assessment of an Optical Character Recognition Model for Northern Haida</title>
</titleInfo>
<name type="personal">
<namePart type="given">Isabell</namePart>
<namePart type="family">Hubert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antti</namePart>
<namePart type="family">Arppe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Lachler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eddie</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We are presenting our work on the creation of the first optical character recognition (OCR) model for Northern Haida, also known as Masset or Xaad Kil, a nearly extinct First Nations language spoken in the Haida Gwaii archipelago in British Columbia, Canada. We are addressing the challenges of training an OCR model for a language with an extensive, non-standard Latin character set as follows: (1) We have compared various training approaches and present the results of practical analyses to maximize recognition accuracy and minimize manual labor. An approach using just one or two pages of Source Images directly performed better than the Image Generation approach, and better than models based on three or more pages. Analyses also suggest that a character’s frequency is directly correlated with its recognition accuracy. (2) We present an overview of current OCR accuracy analysis tools available. (3) We have ported the once de-facto standardized OCR accuracy tools to be able to cope with Unicode input. Our work adds to a growing body of research on OCR for particularly challenging character sets, and contributes to creating the largest electronic corpus for this severely endangered language.</abstract>
<identifier type="citekey">hubert-etal-2016-training</identifier>
<location>
<url>https://aclanthology.org/L16-1514</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>3227</start>
<end>3234</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Training & Quality Assessment of an Optical Character Recognition Model for Northern Haida
%A Hubert, Isabell
%A Arppe, Antti
%A Lachler, Jordan
%A Santos, Eddie A.
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F hubert-etal-2016-training
%X We are presenting our work on the creation of the first optical character recognition (OCR) model for Northern Haida, also known as Masset or Xaad Kil, a nearly extinct First Nations language spoken in the Haida Gwaii archipelago in British Columbia, Canada. We are addressing the challenges of training an OCR model for a language with an extensive, non-standard Latin character set as follows: (1) We have compared various training approaches and present the results of practical analyses to maximize recognition accuracy and minimize manual labor. An approach using just one or two pages of Source Images directly performed better than the Image Generation approach, and better than models based on three or more pages. Analyses also suggest that a character’s frequency is directly correlated with its recognition accuracy. (2) We present an overview of current OCR accuracy analysis tools available. (3) We have ported the once de-facto standardized OCR accuracy tools to be able to cope with Unicode input. Our work adds to a growing body of research on OCR for particularly challenging character sets, and contributes to creating the largest electronic corpus for this severely endangered language.
%U https://aclanthology.org/L16-1514
%P 3227-3234
Markdown (Informal)
[Training & Quality Assessment of an Optical Character Recognition Model for Northern Haida](https://aclanthology.org/L16-1514) (Hubert et al., LREC 2016)
ACL