@inproceedings{bast-etal-2021-tokenization,
    title = "Tokenization Repair in the Presence of Spelling Errors",
    author = "Bast, Hannah  and
      Hertel, Matthias  and
      Mohamed, Mostafa M.",
    editor = "Bisazza, Arianna  and
      Abend, Omri",
    booktitle = "Proceedings of the 25th Conference on Computational Natural Language Learning",
    month = nov,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.conll-1.22/",
    doi = "10.18653/v1/2021.conll-1.22",
    pages = "279--289",
    abstract = "We consider the following tokenization repair problem: Given a natural language text with any combination of missing or spurious spaces, correct these. Spelling errors can be present, but it's not part of the problem to correct them. For example, given: {\textquotedblleft}Tispa per isabout token izaionrep air{\textquotedblright}, compute {\textquotedblleft}Tis paper is about tokenizaion repair{\textquotedblright}. We identify three key ingredients of high-quality tokenization repair, all missing from previous work: deep language models with a bidirectional component, training the models on text with spelling errors, and making use of the space information already present. Our methods also improve existing spell checkers by fixing not only more tokenization errors but also more spelling errors: once it is clear which characters form a word, it is much easier for them to figure out the correct word. We provide six benchmarks that cover three use cases (OCR errors, text extraction from PDF, human errors) and the cases of partially correct space information and all spaces missing. We evaluate our methods against the best existing methods and a non-trivial baseline. We provide full reproducibility under \url{https://ad.informatik.uni-freiburg.de/publications}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="bast-etal-2021-tokenization">
    <titleInfo>
      <title>Tokenization Repair in the Presence of Spelling Errors</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Hannah</namePart>
      <namePart type="family">Bast</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Matthias</namePart>
      <namePart type="family">Hertel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mostafa</namePart>
      <namePart type="given">M.</namePart>
      <namePart type="family">Mohamed</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 25th Conference on Computational Natural Language Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Arianna</namePart>
        <namePart type="family">Bisazza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Omri</namePart>
        <namePart type="family">Abend</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We consider the following tokenization repair problem: Given a natural language text with any combination of missing or spurious spaces, correct these. Spelling errors can be present, but it's not part of the problem to correct them. For example, given: “Tispa per isabout token izaionrep air”, compute “Tis paper is about tokenizaion repair”. We identify three key ingredients of high-quality tokenization repair, all missing from previous work: deep language models with a bidirectional component, training the models on text with spelling errors, and making use of the space information already present. Our methods also improve existing spell checkers by fixing not only more tokenization errors but also more spelling errors: once it is clear which characters form a word, it is much easier for them to figure out the correct word. We provide six benchmarks that cover three use cases (OCR errors, text extraction from PDF, human errors) and the cases of partially correct space information and all spaces missing. We evaluate our methods against the best existing methods and a non-trivial baseline. We provide full reproducibility under https://ad.informatik.uni-freiburg.de/publications.</abstract>
    <identifier type="citekey">bast-etal-2021-tokenization</identifier>
    <identifier type="doi">10.18653/v1/2021.conll-1.22</identifier>
    <location>
      <url>https://aclanthology.org/2021.conll-1.22/</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>279</start>
        <end>289</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Tokenization Repair in the Presence of Spelling Errors
%A Bast, Hannah
%A Hertel, Matthias
%A Mohamed, Mostafa M.
%Y Bisazza, Arianna
%Y Abend, Omri
%S Proceedings of the 25th Conference on Computational Natural Language Learning
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F bast-etal-2021-tokenization
%X We consider the following tokenization repair problem: Given a natural language text with any combination of missing or spurious spaces, correct these. Spelling errors can be present, but it's not part of the problem to correct them. For example, given: “Tispa per isabout token izaionrep air”, compute “Tis paper is about tokenizaion repair”. We identify three key ingredients of high-quality tokenization repair, all missing from previous work: deep language models with a bidirectional component, training the models on text with spelling errors, and making use of the space information already present. Our methods also improve existing spell checkers by fixing not only more tokenization errors but also more spelling errors: once it is clear which characters form a word, it is much easier for them to figure out the correct word. We provide six benchmarks that cover three use cases (OCR errors, text extraction from PDF, human errors) and the cases of partially correct space information and all spaces missing. We evaluate our methods against the best existing methods and a non-trivial baseline. We provide full reproducibility under https://ad.informatik.uni-freiburg.de/publications.
%R 10.18653/v1/2021.conll-1.22
%U https://aclanthology.org/2021.conll-1.22/
%U https://doi.org/10.18653/v1/2021.conll-1.22
%P 279-289
Markdown (Informal)
[Tokenization Repair in the Presence of Spelling Errors](https://aclanthology.org/2021.conll-1.22/) (Bast et al., CoNLL 2021)
ACL
Hannah Bast, Matthias Hertel, and Mostafa M. Mohamed. 2021. Tokenization Repair in the Presence of Spelling Errors. In Proceedings of the 25th Conference on Computational Natural Language Learning, pages 279–289, Online. Association for Computational Linguistics.
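
For a concrete feel of the task the abstract defines, here is a minimal, purely illustrative Python sketch: it drops all (possibly wrong) spaces and searches for the best re-insertion with a dynamic program. This is not the authors' method — the paper uses deep character-level language models with a bidirectional component, trained on text containing spelling errors — and the toy word-score table below is a hypothetical stand-in chosen only to make the example self-contained.

```python
# Illustrative sketch of tokenization repair in the "all spaces missing"
# setting: exhaustive dynamic program over space re-insertions, scored by
# a toy word table. Everything here is a hypothetical stand-in for the
# deep bidirectional language models used in the paper.

from functools import lru_cache

# Toy "language model": log-style scores for a handful of words. Misspelled
# words (e.g. "tis", "tokenizaion") are scored too, because the task repairs
# spaces without correcting spelling.
VOCAB = {
    "tis": 2.0, "paper": 4.0, "is": 3.0, "about": 3.5,
    "tokenizaion": 2.0, "repair": 4.0,
}

def word_score(word: str) -> float:
    # Unknown character spans get a length penalty, so the search prefers
    # known words but can still segment arbitrary input.
    return VOCAB.get(word.lower(), -2.0 * len(word))

@lru_cache(maxsize=None)
def best_segmentation(chars: str) -> tuple[float, tuple[str, ...]]:
    """Best-scoring way to split `chars` into words (memoized DP)."""
    if not chars:
        return 0.0, ()
    best_score, best_words = float("-inf"), ()
    for cut in range(1, len(chars) + 1):
        head, tail = chars[:cut], chars[cut:]
        tail_score, tail_words = best_segmentation(tail)
        score = word_score(head) + tail_score
        if score > best_score:
            best_score, best_words = score, (head,) + tail_words
    return best_score, best_words

def repair_tokenization(text: str) -> str:
    # Drop all existing spaces, then re-insert the best-scoring ones. The
    # paper shows that keeping partially correct space information does
    # better than discarding it; this sketch ignores that for brevity.
    _, words = best_segmentation(text.replace(" ", ""))
    return " ".join(words)

if __name__ == "__main__":
    print(repair_tokenization("Tispa per isabout token izaionrep air"))
    # -> "Tis paper is about tokenizaion repair" (spelling left uncorrected)
```

The exhaustive search is quadratic in the number of memoized subproblems and only practical for short strings; the paper's models instead score space decisions directly with learned character language models, which is what makes the approach scale to the six benchmarks described in the abstract.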