% Workshop paper (ACL Anthology W16-4825). Accented letters in the editor
% names are written as brace-wrapped accent commands so that BibTeX treats
% each one as a single letter when parsing, sorting and abbreviating names.
@inproceedings{mcnamee-2016-language,
    title = "Language and Dialect Discrimination Using Compression-Inspired Language Models",
    author = "McNamee, Paul",
    editor = {Nakov, Preslav and
      Zampieri, Marcos and
      Tan, Liling and
      Ljube{\v{s}}i{\'c}, Nikola and
      Tiedemann, J{\"o}rg and
      Malmasi, Shervin},
    booktitle = "Proceedings of the Third Workshop on {NLP} for Similar Languages, Varieties and Dialects ({V}ar{D}ial3)",
    month = dec,
    year = "2016",
    address = "Osaka, Japan",
    publisher = "The COLING 2016 Organizing Committee",
    url = "https://aclanthology.org/W16-4825/",
    pages = "195--203",
    abstract = "The DSL 2016 shared task continued previous evaluations from 2014 and 2015 that facilitated the study of automated language and dialect identification. This paper describes results for this year's shared task and from several related experiments conducted at the Johns Hopkins University Human Language Technology Center of Excellence (JHU HLTCOE). Previously the HLTCOE has explored the use of compression-inspired language modeling for language and dialect identification, using news, Wikipedia, blog post, and Twitter corpora. The technique we have relied upon is based on prediction by partial matching (PPM), a state of the art text compression technique. Due to the close relationship between adaptive compression and language modeling, such compression techniques can also be applied to multi-way text classification problems, and previous studies have examined tasks such as authorship attribution, email spam detection, and topical classification. We applied our approach to the multi-class decision that considered each dialect or language as a possibility for the given shared task input line. Results for test-set A were in accord with our expectations, however results for test-sets B and C appear to be markedly worse. We had not anticipated the inclusion of multiple communications in differing languages in test-set B (social media) input lines, and had not expected the test-set C (dialectal Arabic) data to be represented phonetically instead of in native orthography."
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, keyed by the citation key mcnamee-2016-language. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mcnamee-2016-language">
<titleInfo>
<title>Language and Dialect Discrimination Using Compression-Inspired Language Models</title>
</titleInfo>
<!-- Author of the paper. -->
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">McNamee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year and month. -->
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume in which the paper appears, with its editors, publisher and place. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial3)</title>
</titleInfo>
<!-- Editors of the proceedings, one name element each. -->
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liling</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shervin</namePart>
<namePart type="family">Malmasi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<!-- Publisher and place of publication of the proceedings. -->
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The DSL 2016 shared task continued previous evaluations from 2014 and 2015 that facilitated the study of automated language and dialect identification. This paper describes results for this year’s shared task and from several related experiments conducted at the Johns Hopkins University Human Language Technology Center of Excellence (JHU HLTCOE). Previously the HLTCOE has explored the use of compression-inspired language modeling for language and dialect identification, using news, Wikipedia, blog post, and Twitter corpora. The technique we have relied upon is based on prediction by partial matching (PPM), a state of the art text compression technique. Due to the close relationship between adaptive compression and language modeling, such compression techniques can also be applied to multi-way text classification problems, and previous studies have examined tasks such as authorship attribution, email spam detection, and topical classification. We applied our approach to the multi-class decision that considered each dialect or language as a possibility for the given shared task input line. Results for test-set A were in accord with our expectations, however results for test-sets B and C appear to be markedly worse. We had not anticipated the inclusion of multiple communications in differing languages in test-set B (social media) input lines, and had not expected the test-set C (dialectal Arabic) data to be represented phonetically instead of in native orthography.</abstract>
<!-- Citation key; same value as the ID attribute on the mods element. -->
<identifier type="citekey">mcnamee-2016-language</identifier>
<location>
<url>https://aclanthology.org/W16-4825/</url>
</location>
<!-- Date and page range of the paper. -->
<part>
<date>2016-12</date>
<extent unit="page">
<start>195</start>
<end>203</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language and Dialect Discrimination Using Compression-Inspired Language Models
%A McNamee, Paul
%Y Nakov, Preslav
%Y Zampieri, Marcos
%Y Tan, Liling
%Y Ljubešić, Nikola
%Y Tiedemann, Jörg
%Y Malmasi, Shervin
%S Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial3)
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F mcnamee-2016-language
%X The DSL 2016 shared task continued previous evaluations from 2014 and 2015 that facilitated the study of automated language and dialect identification. This paper describes results for this year’s shared task and from several related experiments conducted at the Johns Hopkins University Human Language Technology Center of Excellence (JHU HLTCOE). Previously the HLTCOE has explored the use of compression-inspired language modeling for language and dialect identification, using news, Wikipedia, blog post, and Twitter corpora. The technique we have relied upon is based on prediction by partial matching (PPM), a state of the art text compression technique. Due to the close relationship between adaptive compression and language modeling, such compression techniques can also be applied to multi-way text classification problems, and previous studies have examined tasks such as authorship attribution, email spam detection, and topical classification. We applied our approach to the multi-class decision that considered each dialect or language as a possibility for the given shared task input line. Results for test-set A were in accord with our expectations, however results for test-sets B and C appear to be markedly worse. We had not anticipated the inclusion of multiple communications in differing languages in test-set B (social media) input lines, and had not expected the test-set C (dialectal Arabic) data to be represented phonetically instead of in native orthography.
%U https://aclanthology.org/W16-4825/
%P 195-203
Markdown (Informal)
[Language and Dialect Discrimination Using Compression-Inspired Language Models](https://aclanthology.org/W16-4825/) (McNamee, VarDial 2016)
ACL