@inproceedings{reynaert-2006-corpus,
title = "Corpus-Induced Corpus Clean-up",
author = "Reynaert, Martin",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Gangemi, Aldo and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Tapias, Daniel",
booktitle = "Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC}{'}06)",
month = may,
year = "2006",
address = "Genoa, Italy",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2006/pdf/229_pdf.pdf",
abstract = "We explore the feasibility of using only unsupervised means to identify non-words, i.e. typos, in a frequency list derived from a large corpus of Dutch and to distinguish between these non-words and real-words in the language. We call the system we built and evaluate in this paper ciccl, which stands for Corpus-Induced Corpus Clean-up. The algorithm on which ciccl is primarily based is the anagram-key hashing algorithm introduced by (Reynaert, 2004). The core correction mechanism is a simple and effective method which translates the actual characters which make up a word into a large natural number in such a way that all the anagrams, i.e. all the words composed of precisely the same subset of characters, are allocated the same natural number. In effect, this constitutes a novel approximate string matching algorithm for indexed text search. This is because by simple addition, subtraction or a combination of both, all variants within reach of the range of numerical values defined in the alphabet are retrieved by iterating over the alphabet. ciccl's input consists primarily of corpus derived frequency lists, from which it derives valuable morphological information by performing frequency counts over the substrings of the words, which are then used to perform decompounding, as well as for distinguishing between most likely correctly spelled words and typos.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="reynaert-2006-corpus">
<titleInfo>
<title>Corpus-Induced Corpus Clean-up</title>
</titleInfo>
<name type="personal">
<namePart type="given">Martin</namePart>
<namePart type="family">Reynaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2006-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aldo</namePart>
<namePart type="family">Gangemi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Genoa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We explore the feasibility of using only unsupervised means to identify non-words, i.e. typos, in a frequency list derived from a large corpus of Dutch and to distinguish between these non-words and real-words in the language. We call the system we built and evaluate in this paper ciccl, which stands for Corpus-Induced Corpus Clean-up. The algorithm on which ciccl is primarily based is the anagram-key hashing algorithm introduced by (Reynaert, 2004). The core correction mechanism is a simple and effective method which translates the actual characters which make up a word into a large natural number in such a way that all the anagrams, i.e. all the words composed of precisely the same subset of characters, are allocated the same natural number. In effect, this constitutes a novel approximate string matching algorithm for indexed text search. This is because by simple addition, subtraction or a combination of both, all variants within reach of the range of numerical values defined in the alphabet are retrieved by iterating over the alphabet. ciccl’s input consists primarily of corpus derived frequency lists, from which it derives valuable morphological information by performing frequency counts over the substrings of the words, which are then used to perform decompounding, as well as for distinguishing between most likely correctly spelled words and typos.</abstract>
<identifier type="citekey">reynaert-2006-corpus</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2006/pdf/229_pdf.pdf</url>
</location>
<part>
<date>2006-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Corpus-Induced Corpus Clean-up
%A Reynaert, Martin
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Gangemi, Aldo
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Tapias, Daniel
%S Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)
%D 2006
%8 May
%I European Language Resources Association (ELRA)
%C Genoa, Italy
%F reynaert-2006-corpus
%X We explore the feasibility of using only unsupervised means to identify non-words, i.e. typos, in a frequency list derived from a large corpus of Dutch and to distinguish between these non-words and real-words in the language. We call the system we built and evaluate in this paper ciccl, which stands for Corpus-Induced Corpus Clean-up. The algorithm on which ciccl is primarily based is the anagram-key hashing algorithm introduced by (Reynaert, 2004). The core correction mechanism is a simple and effective method which translates the actual characters which make up a word into a large natural number in such a way that all the anagrams, i.e. all the words composed of precisely the same subset of characters, are allocated the same natural number. In effect, this constitutes a novel approximate string matching algorithm for indexed text search. This is because by simple addition, subtraction or a combination of both, all variants within reach of the range of numerical values defined in the alphabet are retrieved by iterating over the alphabet. ciccl’s input consists primarily of corpus derived frequency lists, from which it derives valuable morphological information by performing frequency counts over the substrings of the words, which are then used to perform decompounding, as well as for distinguishing between most likely correctly spelled words and typos.
%U http://www.lrec-conf.org/proceedings/lrec2006/pdf/229_pdf.pdf
Markdown (Informal)
[Corpus-Induced Corpus Clean-up](http://www.lrec-conf.org/proceedings/lrec2006/pdf/229_pdf.pdf) (Reynaert, LREC 2006)
ACL
- Martin Reynaert. 2006. Corpus-Induced Corpus Clean-up. In Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06), Genoa, Italy. European Language Resources Association (ELRA).