@inproceedings{gomez-etal-2017-discriminating,
title = "Discriminating between Similar Languages Using a Combination of Typed and Untyped Character N-grams and Words",
author = "Gomez, Helena and
Markov, Ilia and
Baptista, Jorge and
Sidorov, Grigori and
Pinto, David",
editor = {Nakov, Preslav and
Zampieri, Marcos and
Ljube\v si\'c, Nikola and
Tiedemann, J\"org and
Malmasi, Shevin and
Ali, Ahmed},
booktitle = "Proceedings of the Fourth Workshop on {NLP} for Similar Languages, Varieties and Dialects ({V}ar{D}ial)",
month = apr,
year = "2017",
address = "Valencia, Spain",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W17-1217/",
doi = "10.18653/v1/W17-1217",
pages = "137--145",
abstract = "This paper presents the cic\_ualg's system that took part in the Discriminating between Similar Languages (DSL) shared task, held at the VarDial 2017 Workshop. This year's task aims at identifying 14 languages across 6 language groups using a corpus of excerpts of journalistic texts. Two classification approaches were compared: a single-step (all languages) approach and a two-step (language group and then languages within the group) approach. Features exploited include lexical features (unigrams of words) and character n-grams. Besides traditional (untyped) character n-grams, we introduce typed character n-grams in the DSL task. Experiments were carried out with different feature representation methods (binary and raw term frequency), frequency threshold values, and machine-learning algorithms -- Support Vector Machines (SVM) and Multinomial Naive Bayes (MNB). Our best run in the DSL task achieved 91.46\% accuracy."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gomez-etal-2017-discriminating">
<titleInfo>
<title>Discriminating between Similar Languages Using a Combination of Typed and Untyped Character N-grams and Words</title>
</titleInfo>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilia</namePart>
<namePart type="family">Markov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="family">Baptista</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Grigori</namePart>
<namePart type="family">Sidorov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Pinto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shevin</namePart>
<namePart type="family">Malmasi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Valencia, Spain</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents the cic_ualg’s system that took part in the Discriminating between Similar Languages (DSL) shared task, held at the VarDial 2017 Workshop. This year’s task aims at identifying 14 languages across 6 language groups using a corpus of excerpts of journalistic texts. Two classification approaches were compared: a single-step (all languages) approach and a two-step (language group and then languages within the group) approach. Features exploited include lexical features (unigrams of words) and character n-grams. Besides traditional (untyped) character n-grams, we introduce typed character n-grams in the DSL task. Experiments were carried out with different feature representation methods (binary and raw term frequency), frequency threshold values, and machine-learning algorithms – Support Vector Machines (SVM) and Multinomial Naive Bayes (MNB). Our best run in the DSL task achieved 91.46% accuracy.</abstract>
<identifier type="citekey">gomez-etal-2017-discriminating</identifier>
<identifier type="doi">10.18653/v1/W17-1217</identifier>
<location>
<url>https://aclanthology.org/W17-1217/</url>
</location>
<part>
<date>2017-04</date>
<extent unit="page">
<start>137</start>
<end>145</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Discriminating between Similar Languages Using a Combination of Typed and Untyped Character N-grams and Words
%A Gomez, Helena
%A Markov, Ilia
%A Baptista, Jorge
%A Sidorov, Grigori
%A Pinto, David
%Y Nakov, Preslav
%Y Zampieri, Marcos
%Y Ljubešić, Nikola
%Y Tiedemann, Jörg
%Y Malmasi, Shevin
%Y Ali, Ahmed
%S Proceedings of the Fourth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial)
%D 2017
%8 April
%I Association for Computational Linguistics
%C Valencia, Spain
%F gomez-etal-2017-discriminating
%X This paper presents the cic_ualg’s system that took part in the Discriminating between Similar Languages (DSL) shared task, held at the VarDial 2017 Workshop. This year’s task aims at identifying 14 languages across 6 language groups using a corpus of excerpts of journalistic texts. Two classification approaches were compared: a single-step (all languages) approach and a two-step (language group and then languages within the group) approach. Features exploited include lexical features (unigrams of words) and character n-grams. Besides traditional (untyped) character n-grams, we introduce typed character n-grams in the DSL task. Experiments were carried out with different feature representation methods (binary and raw term frequency), frequency threshold values, and machine-learning algorithms – Support Vector Machines (SVM) and Multinomial Naive Bayes (MNB). Our best run in the DSL task achieved 91.46% accuracy.
%R 10.18653/v1/W17-1217
%U https://aclanthology.org/W17-1217/
%U https://doi.org/10.18653/v1/W17-1217
%P 137-145
Markdown (Informal)
[Discriminating between Similar Languages Using a Combination of Typed and Untyped Character N-grams and Words](https://aclanthology.org/W17-1217/) (Gomez et al., VarDial 2017)
ACL