% Paper in the proceedings of the Ninth Workshop on NLP for Similar Languages,
% Varieties and Dialects (VarDial 2022); record 2022.vardial-1.10 on aclanthology.org.
% Words that must keep their capitalisation under sentence-casing styles are
% braced as whole words.
% NOTE(review): the address field presumably holds the workshop venue rather
% than the publisher's city; confirm before normalising it.
@inproceedings{camposampiero-etal-2022-curious,
  title     = {The Curious Case of Logistic Regression for {Italian} Languages and Dialects Identification},
  author    = {Camposampiero, Giacomo and
               Nguyen, Quynh Anh and
               Di Stefano, Francesco},
  editor    = {Scherrer, Yves and
               Jauhiainen, Tommi and
               Ljube{\v{s}}i{\'c}, Nikola and
               Nakov, Preslav and
               Tiedemann, J{\"o}rg and
               Zampieri, Marcos},
  booktitle = {Proceedings of the Ninth Workshop on {NLP} for Similar Languages, Varieties and Dialects},
  month     = oct,
  year      = {2022},
  address   = {Gyeongju, Republic of Korea},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.vardial-1.10},
  pages     = {86--98},
  abstract  = {Automatic Language Identification represents an important task for improving many real-world applications such as opinion mining and machine translation. In the case of closely-related languages such as regional dialects, this task is often challenging. In this paper, we propose an extensive evaluation of different approaches for the identification of Italian dialects and languages, spanning from classical machine learning models to more complex neural architectures and state-of-the-art pre-trained language models. Surprisingly, shallow machine learning models managed to outperform huge pre-trained language models in this specific task. This work was developed in the context of the Identification of Languages and Dialects of Italy (ITDI) task organised at VarDial 2022 Evaluation Campaign. Our best submission managed to achieve a weighted F1-score of 0.6880, ranking 5th out of 9 final submissions.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for a single paper; the mods ID matches the citekey identifier below. -->
  <mods ID="camposampiero-etal-2022-curious">
    <titleInfo>
      <title>The Curious Case of Logistic Regression for Italian Languages and Dialects Identification</title>
    </titleInfo>
    <!-- Paper authors, one name element each. -->
    <name type="personal">
      <namePart type="given">Giacomo</namePart>
      <namePart type="family">Camposampiero</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Quynh</namePart>
      <namePart type="given">Anh</namePart>
      <namePart type="family">Nguyen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Francesco</namePart>
      <namePart type="family">Di Stefano</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-10</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Ninth Workshop on NLP for Similar Languages, Varieties and Dialects</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yves</namePart>
        <namePart type="family">Scherrer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tommi</namePart>
        <namePart type="family">Jauhiainen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nikola</namePart>
        <namePart type="family">Ljubešić</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jörg</namePart>
        <namePart type="family">Tiedemann</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marcos</namePart>
        <namePart type="family">Zampieri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Automatic Language Identification represents an important task for improving many real-world applications such as opinion mining and machine translation. In the case of closely-related languages such as regional dialects, this task is often challenging. In this paper, we propose an extensive evaluation of different approaches for the identification of Italian dialects and languages, spanning from classical machine learning models to more complex neural architectures and state-of-the-art pre-trained language models. Surprisingly, shallow machine learning models managed to outperform huge pre-trained language models in this specific task. This work was developed in the context of the Identification of Languages and Dialects of Italy (ITDI) task organised at VarDial 2022 Evaluation Campaign. Our best submission managed to achieve a weighted F1-score of 0.6880, ranking 5th out of 9 final submissions.</abstract>
    <identifier type="citekey">camposampiero-etal-2022-curious</identifier>
    <location>
      <url>https://aclanthology.org/2022.vardial-1.10</url>
    </location>
    <!-- Issue date and page range of the paper. -->
    <part>
      <date>2022-10</date>
      <extent unit="page">
        <start>86</start>
        <end>98</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Curious Case of Logistic Regression for Italian Languages and Dialects Identification
%A Camposampiero, Giacomo
%A Nguyen, Quynh Anh
%A Di Stefano, Francesco
%Y Scherrer, Yves
%Y Jauhiainen, Tommi
%Y Ljubešić, Nikola
%Y Nakov, Preslav
%Y Tiedemann, Jörg
%Y Zampieri, Marcos
%S Proceedings of the Ninth Workshop on NLP for Similar Languages, Varieties and Dialects
%D 2022
%8 October
%I Association for Computational Linguistics
%C Gyeongju, Republic of Korea
%F camposampiero-etal-2022-curious
%X Automatic Language Identification represents an important task for improving many real-world applications such as opinion mining and machine translation. In the case of closely-related languages such as regional dialects, this task is often challenging. In this paper, we propose an extensive evaluation of different approaches for the identification of Italian dialects and languages, spanning from classical machine learning models to more complex neural architectures and state-of-the-art pre-trained language models. Surprisingly, shallow machine learning models managed to outperform huge pre-trained language models in this specific task. This work was developed in the context of the Identification of Languages and Dialects of Italy (ITDI) task organised at VarDial 2022 Evaluation Campaign. Our best submission managed to achieve a weighted F1-score of 0.6880, ranking 5th out of 9 final submissions.
%U https://aclanthology.org/2022.vardial-1.10
%P 86-98
Markdown (Informal)
[The Curious Case of Logistic Regression for Italian Languages and Dialects Identification](https://aclanthology.org/2022.vardial-1.10) (Camposampiero et al., VarDial 2022)
ACL