@inproceedings{dinarelli-rosset-2012-tree-structured,
title = "Tree-Structured Named Entity Recognition on {OCR} Data: Analysis, Processing and Results",
author = "Dinarelli, Marco and
Rosset, Sophie",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Do{\u{g}}an, Mehmet U{\u{g}}ur and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/1046_Paper.pdf",
pages = "1266--1272",
abstract = "In this paper we deal with named entity detection on data acquired via OCR process on documents dating from 1890. The resulting corpus is very noisy. We perform an analysis to find possible strategies to overcome errors introduced by the OCR process. We propose a preprocessing procedure in three steps to clean data and correct, at least in part, OCR mistakes. The task is made even harder by the complex tree-structure of named entities annotated on data, we solve this problem however by adopting an effective named entity detection system we proposed in previous work. We evaluate our procedure for preprocessing OCR-ized data in two ways: in terms of perplexity and OOV rate of a language model on development and evaluation data, and in terms of the performance of the named entity detection system on the preprocessed data. The preprocessing procedure results to be effective, allowing to improve by a large margin the system we proposed for the official evaluation campaign on Old Press, and allowing to outperform also the best performing system of the evaluation campaign.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dinarelli-rosset-2012-tree-structured">
<titleInfo>
<title>Tree-Structured Named Entity Recognition on OCR Data: Analysis, Processing and Results</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Dinarelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophie</namePart>
<namePart type="family">Rosset</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2012-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehmet</namePart>
<namePart type="given">Uğur</namePart>
<namePart type="family">Doğan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Istanbul, Turkey</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper we deal with named entity detection on data acquired via OCR process on documents dating from 1890. The resulting corpus is very noisy. We perform an analysis to find possible strategies to overcome errors introduced by the OCR process. We propose a preprocessing procedure in three steps to clean data and correct, at least in part, OCR mistakes. The task is made even harder by the complex tree-structure of named entities annotated on data, we solve this problem however by adopting an effective named entity detection system we proposed in previous work. We evaluate our procedure for preprocessing OCR-ized data in two ways: in terms of perplexity and OOV rate of a language model on development and evaluation data, and in terms of the performance of the named entity detection system on the preprocessed data. The preprocessing procedure results to be effective, allowing to improve by a large margin the system we proposed for the official evaluation campaign on Old Press, and allowing to outperform also the best performing system of the evaluation campaign.</abstract>
<identifier type="citekey">dinarelli-rosset-2012-tree-structured</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/1046_Paper.pdf</url>
</location>
<part>
<date>2012-05</date>
<extent unit="page">
<start>1266</start>
<end>1272</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tree-Structured Named Entity Recognition on OCR Data: Analysis, Processing and Results
%A Dinarelli, Marco
%A Rosset, Sophie
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Doğan, Mehmet Uğur
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 May
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F dinarelli-rosset-2012-tree-structured
%X In this paper we deal with named entity detection on data acquired via OCR process on documents dating from 1890. The resulting corpus is very noisy. We perform an analysis to find possible strategies to overcome errors introduced by the OCR process. We propose a preprocessing procedure in three steps to clean data and correct, at least in part, OCR mistakes. The task is made even harder by the complex tree-structure of named entities annotated on data, we solve this problem however by adopting an effective named entity detection system we proposed in previous work. We evaluate our procedure for preprocessing OCR-ized data in two ways: in terms of perplexity and OOV rate of a language model on development and evaluation data, and in terms of the performance of the named entity detection system on the preprocessed data. The preprocessing procedure results to be effective, allowing to improve by a large margin the system we proposed for the official evaluation campaign on Old Press, and allowing to outperform also the best performing system of the evaluation campaign.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/1046_Paper.pdf
%P 1266-1272
Markdown (Informal)
[Tree-Structured Named Entity Recognition on OCR Data: Analysis, Processing and Results](http://www.lrec-conf.org/proceedings/lrec2012/pdf/1046_Paper.pdf) (Dinarelli & Rosset, LREC 2012)
ACL