% Conference paper in the LREC'10 proceedings. Values are brace-delimited so embedded quotes need no escaping.
@inproceedings{konstantopoulos-2010-learning,
    title     = {Learning Language Identification Models: A Comparative Analysis of the Distinctive Features of Names and Common Words},
    author    = {Konstantopoulos, Stasinos},
    editor    = {Calzolari, Nicoletta and
                 Choukri, Khalid and
                 Maegaard, Bente and
                 Mariani, Joseph and
                 Odijk, Jan and
                 Piperidis, Stelios and
                 Rosner, Mike and
                 Tapias, Daniel},
    booktitle = {Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)},
    month     = may,
    year      = {2010},
    address   = {Valletta, Malta},
    publisher = {European Language Resources Association (ELRA)},
    url       = {http://www.lrec-conf.org/proceedings/lrec2010/pdf/452_Paper.pdf},
    abstract  = {The intuition and basic hypothesis that this paper explores is that names are more characteristic of their language than common words are, and that a single name can have enough clues to confidently identify its language where random text of the same length wouldn't. To test this hypothesis, n-gram modelling is used to learn language models which identify the language of isolated names and equally short document fragments. As the empirical results corroborate the prior intuition, an explanation is sought for the higher accuracy at which the language of names can be identified. The results of the application of these models, as well as the models themselves, are quantitatively and qualitatively analysed and a hypothesis is formed about the explanation of this difference. The conclusions derived are both technologically useful in information extraction or text-to-speech tasks, and theoretically interesting as a tool for improving our understanding of the morphology and phonology of the languages involved in the experiments.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="konstantopoulos-2010-learning">
<titleInfo>
<title>Learning Language Identification Models: A Comparative Analysis of the Distinctive Features of Names and Common Words</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stasinos</namePart>
<namePart type="family">Konstantopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The intuition and basic hypothesis that this paper explores is that names are more characteristic of their language than common words are, and that a single name can have enough clues to confidently identify its language where random text of the same length wouldn’t. To test this hypothesis, n-gram modelling is used to learn language models which identify the language of isolated names and equally short document fragments. As the empirical results corroborate the prior intuition, an explanation is sought for the higher accuracy at which the language of names can be identified. The results of the application of these models, as well as the models themselves, are quantitatively and qualitatively analysed and a hypothesis is formed about the explanation of this difference. The conclusions derived are both technologically useful in information extraction or text-to-speech tasks, and theoretically interesting as a tool for improving our understanding of the morphology and phonology of the languages involved in the experiments.</abstract>
<identifier type="citekey">konstantopoulos-2010-learning</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/452_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning Language Identification Models: A Comparative Analysis of the Distinctive Features of Names and Common Words
%A Konstantopoulos, Stasinos
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F konstantopoulos-2010-learning
%X The intuition and basic hypothesis that this paper explores is that names are more characteristic of their language than common words are, and that a single name can have enough clues to confidently identify its language where random text of the same length wouldn’t. To test this hypothesis, n-gram modelling is used to learn language models which identify the language of isolated names and equally short document fragments. As the empirical results corroborate the prior intuition, an explanation is sought for the higher accuracy at which the language of names can be identified. The results of the application of these models, as well as the models themselves, are quantitatively and qualitatively analysed and a hypothesis is formed about the explanation of this difference. The conclusions derived are both technologically useful in information extraction or text-to-speech tasks, and theoretically interesting as a tool for improving our understanding of the morphology and phonology of the languages involved in the experiments.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/452_Paper.pdf
Markdown (Informal)
[Learning Language Identification Models: A Comparative Analysis of the Distinctive Features of Names and Common Words](http://www.lrec-conf.org/proceedings/lrec2010/pdf/452_Paper.pdf) (Konstantopoulos, LREC 2010)
ACL