@inproceedings{xia-etal-2010-problems,
title = "The Problems of Language Identification within Hugely Multilingual Data Sets",
author = "Xia, Fei and
Lewis, Carrie and
Lewis, William D.",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/921_Paper.pdf",
abstract = "As the data for more and more languages is finding its way into digital form, with an increasing amount of this data being posted to the Web, it has become possible to collect language data from the Web and create large multilingual resources, covering hundreds or even thousands of languages. ODIN, the Online Database of INterlinear text (Lewis, 2006), is such a resource. It currently consists of nearly 200,000 data points for over 1,000 languages, the data for which was harvested from linguistic documents on the Web. We identify a number of issues with language identification for such broad-coverage resources including the lack of training data, ambiguous language names, incomplete language code sets, and incorrect uses of language names and codes. After providing a short overview of existing language code sets maintained by the linguistic community, we discuss what linguists and the linguistic community can do to make the process of language identification easier.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xia-etal-2010-problems">
<titleInfo>
<title>The Problems of Language Identification within Hugely Multilingual Data Sets</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carrie</namePart>
<namePart type="family">Lewis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Lewis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As the data for more and more languages is finding its way into digital form, with an increasing amount of this data being posted to the Web, it has become possible to collect language data from the Web and create large multilingual resources, covering hundreds or even thousands of languages. ODIN, the Online Database of INterlinear text (Lewis, 2006), is such a resource. It currently consists of nearly 200,000 data points for over 1,000 languages, the data for which was harvested from linguistic documents on the Web. We identify a number of issues with language identification for such broad-coverage resources including the lack of training data, ambiguous language names, incomplete language code sets, and incorrect uses of language names and codes. After providing a short overview of existing language code sets maintained by the linguistic community, we discuss what linguists and the linguistic community can do to make the process of language identification easier.</abstract>
<identifier type="citekey">xia-etal-2010-problems</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/921_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Problems of Language Identification within Hugely Multilingual Data Sets
%A Xia, Fei
%A Lewis, Carrie
%A Lewis, William D.
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F xia-etal-2010-problems
%X As the data for more and more languages is finding its way into digital form, with an increasing amount of this data being posted to the Web, it has become possible to collect language data from the Web and create large multilingual resources, covering hundreds or even thousands of languages. ODIN, the Online Database of INterlinear text (Lewis, 2006), is such a resource. It currently consists of nearly 200,000 data points for over 1,000 languages, the data for which was harvested from linguistic documents on the Web. We identify a number of issues with language identification for such broad-coverage resources including the lack of training data, ambiguous language names, incomplete language code sets, and incorrect uses of language names and codes. After providing a short overview of existing language code sets maintained by the linguistic community, we discuss what linguists and the linguistic community can do to make the process of language identification easier.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/921_Paper.pdf
Markdown (Informal)
[The Problems of Language Identification within Hugely Multilingual Data Sets](http://www.lrec-conf.org/proceedings/lrec2010/pdf/921_Paper.pdf) (Xia et al., LREC 2010)
ACL