@inproceedings{boros-etal-2020-alleviating,
title = "Alleviating Digitization Errors in Named Entity Recognition for Historical Documents",
author = "Boros, Emanuela and
Hamdi, Ahmed and
Linhares Pontes, Elvys and
Cabrera-Diego, Luis Adri{\'a}n and
Moreno, Jose G. and
Sidere, Nicolas and
Doucet, Antoine",
editor = "Fern{\'a}ndez, Raquel and
Linzen, Tal",
booktitle = "Proceedings of the 24th Conference on Computational Natural Language Learning",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.conll-1.35",
doi = "10.18653/v1/2020.conll-1.35",
pages = "431--441",
abstract = "This paper tackles the task of named entity recognition (NER) applied to digitized historical texts obtained from processing digital images of newspapers using optical character recognition (OCR) techniques. We argue that the main challenge for this task is that the OCR process leads to misspellings and linguistic errors in the output text. Moreover, historical variations can be present in aged documents, which can impact the performance of the NER process. We conduct a comparative evaluation on two historical datasets in German and French against previous state-of-the-art models, and we propose a model based on a hierarchical stack of Transformers to approach the NER task for historical data. Our findings show that the proposed model clearly improves the results on both historical datasets, and does not degrade the results for modern datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="boros-etal-2020-alleviating">
<titleInfo>
<title>Alleviating Digitization Errors in Named Entity Recognition for Historical Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Emanuela</namePart>
<namePart type="family">Boros</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Hamdi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elvys</namePart>
<namePart type="family">Linhares Pontes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">Adrián</namePart>
<namePart type="family">Cabrera-Diego</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jose</namePart>
<namePart type="given">G</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolas</namePart>
<namePart type="family">Sidere</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antoine</namePart>
<namePart type="family">Doucet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 24th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Raquel</namePart>
<namePart type="family">Fernández</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tal</namePart>
<namePart type="family">Linzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper tackles the task of named entity recognition (NER) applied to digitized historical texts obtained from processing digital images of newspapers using optical character recognition (OCR) techniques. We argue that the main challenge for this task is that the OCR process leads to misspellings and linguistic errors in the output text. Moreover, historical variations can be present in aged documents, which can impact the performance of the NER process. We conduct a comparative evaluation on two historical datasets in German and French against previous state-of-the-art models, and we propose a model based on a hierarchical stack of Transformers to approach the NER task for historical data. Our findings show that the proposed model clearly improves the results on both historical datasets, and does not degrade the results for modern datasets.</abstract>
<identifier type="citekey">boros-etal-2020-alleviating</identifier>
<identifier type="doi">10.18653/v1/2020.conll-1.35</identifier>
<location>
<url>https://aclanthology.org/2020.conll-1.35</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>431</start>
<end>441</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Alleviating Digitization Errors in Named Entity Recognition for Historical Documents
%A Boros, Emanuela
%A Hamdi, Ahmed
%A Linhares Pontes, Elvys
%A Cabrera-Diego, Luis Adrián
%A Moreno, Jose G.
%A Sidere, Nicolas
%A Doucet, Antoine
%Y Fernández, Raquel
%Y Linzen, Tal
%S Proceedings of the 24th Conference on Computational Natural Language Learning
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F boros-etal-2020-alleviating
%X This paper tackles the task of named entity recognition (NER) applied to digitized historical texts obtained from processing digital images of newspapers using optical character recognition (OCR) techniques. We argue that the main challenge for this task is that the OCR process leads to misspellings and linguistic errors in the output text. Moreover, historical variations can be present in aged documents, which can impact the performance of the NER process. We conduct a comparative evaluation on two historical datasets in German and French against previous state-of-the-art models, and we propose a model based on a hierarchical stack of Transformers to approach the NER task for historical data. Our findings show that the proposed model clearly improves the results on both historical datasets, and does not degrade the results for modern datasets.
%R 10.18653/v1/2020.conll-1.35
%U https://aclanthology.org/2020.conll-1.35
%U https://doi.org/10.18653/v1/2020.conll-1.35
%P 431-441
Markdown (Informal)
[Alleviating Digitization Errors in Named Entity Recognition for Historical Documents](https://aclanthology.org/2020.conll-1.35) (Boros et al., CoNLL 2020)
ACL