@inproceedings{vatanen-etal-2010-language,
title = "Language Identification of Short Text Segments with N-gram Models",
author = {Vatanen, Tommi and
V{\"a}yrynen, Jaakko J. and
Virpioja, Sami},
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}`10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L10-1193/",
abstract = "There are many accurate methods for language identification of long text samples, but identification of very short strings still presents a challenge. This paper studies a language identification task, in which the test samples have only 5-21 characters. We compare two distinct methods that are well suited for this task: a naive Bayes classifier based on character n-gram models, and the ranking method by Cavnar and Trenkle (1994). For the n-gram models, we test several standard smoothing techniques, including the current state-of-the-art, the modified Kneser-Ney interpolation. Experiments are conducted with 281 languages using the Universal Declaration of Human Rights. Advanced language model smoothing techniques improve the identification accuracy and the respective classifiers outperform the ranking method. The higher accuracy is obtained at the cost of larger models and slower classification speed. However, there are several methods to reduce the size of an n-gram model, and our experiments with model pruning show that it provides an easy way to balance the size and the identification accuracy. We also compare the results to the language identifier in Google AJAX Language API, using a subset of 50 languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vatanen-etal-2010-language">
<titleInfo>
<title>Language Identification of Short Text Segments with N-gram Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tommi</namePart>
<namePart type="family">Vatanen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaakko</namePart>
<namePart type="given">J</namePart>
<namePart type="family">Väyrynen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sami</namePart>
<namePart type="family">Virpioja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC‘10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>There are many accurate methods for language identification of long text samples, but identification of very short strings still presents a challenge. This paper studies a language identification task, in which the test samples have only 5-21 characters. We compare two distinct methods that are well suited for this task: a naive Bayes classifier based on character n-gram models, and the ranking method by Cavnar and Trenkle (1994). For the n-gram models, we test several standard smoothing techniques, including the current state-of-the-art, the modified Kneser-Ney interpolation. Experiments are conducted with 281 languages using the Universal Declaration of Human Rights. Advanced language model smoothing techniques improve the identification accuracy and the respective classifiers outperform the ranking method. The higher accuracy is obtained at the cost of larger models and slower classification speed. However, there are several methods to reduce the size of an n-gram model, and our experiments with model pruning show that it provides an easy way to balance the size and the identification accuracy. We also compare the results to the language identifier in Google AJAX Language API, using a subset of 50 languages.</abstract>
<identifier type="citekey">vatanen-etal-2010-language</identifier>
<location>
<url>https://aclanthology.org/L10-1193/</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Identification of Short Text Segments with N-gram Models
%A Vatanen, Tommi
%A Väyrynen, Jaakko J.
%A Virpioja, Sami
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC‘10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F vatanen-etal-2010-language
%X There are many accurate methods for language identification of long text samples, but identification of very short strings still presents a challenge. This paper studies a language identification task, in which the test samples have only 5-21 characters. We compare two distinct methods that are well suited for this task: a naive Bayes classifier based on character n-gram models, and the ranking method by Cavnar and Trenkle (1994). For the n-gram models, we test several standard smoothing techniques, including the current state-of-the-art, the modified Kneser-Ney interpolation. Experiments are conducted with 281 languages using the Universal Declaration of Human Rights. Advanced language model smoothing techniques improve the identification accuracy and the respective classifiers outperform the ranking method. The higher accuracy is obtained at the cost of larger models and slower classification speed. However, there are several methods to reduce the size of an n-gram model, and our experiments with model pruning show that it provides an easy way to balance the size and the identification accuracy. We also compare the results to the language identifier in Google AJAX Language API, using a subset of 50 languages.
%U https://aclanthology.org/L10-1193/
Markdown (Informal)
[Language Identification of Short Text Segments with N-gram Models](https://aclanthology.org/L10-1193/) (Vatanen et al., LREC 2010)
ACL