@inproceedings{althobaiti-2021-country,
title = "Country-level {A}rabic Dialect Identification Using Small Datasets with Integrated Machine Learning Techniques and Deep Learning Models",
author = "Althobaiti, Maha J.",
editor = "Habash, Nizar and
Bouamor, Houda and
Hajj, Hazem and
Magdy, Walid and
Zaghouani, Wajdi and
Bougares, Fethi and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Touileb, Samia",
booktitle = "Proceedings of the Sixth Arabic Natural Language Processing Workshop",
month = apr,
year = "2021",
address = "Kyiv, Ukraine (Virtual)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.wanlp-1.30",
pages = "265--270",
abstract = "Arabic is characterised by a considerable number of varieties including spoken dialects. In this paper, we presented our models developed to participate in the NADI subtask 1.2 that requires building a system to distinguish between 21 country-level dialects. We investigated several classical machine learning approaches and deep learning models using small datasets. We examined an integration technique between two machine learning approaches. Additionally, we created dictionaries automatically based on Pointwise Mutual Information and labelled datasets, which enriched the feature space when training models. A semi-supervised learning approach was also examined and compared to other methods that exploit large unlabelled datasets, such as building pre-trained word embeddings. Our winning model was the Support Vector Machine with dictionary-based features and Pointwise Mutual Information values, achieving an 18.94{\%} macro-average F1-score.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="althobaiti-2021-country">
<titleInfo>
<title>Country-level Arabic Dialect Identification Using Small Datasets with Integrated Machine Learning Techniques and Deep Learning Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maha</namePart>
<namePart type="given">J</namePart>
<namePart type="family">Althobaiti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Arabic Natural Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hazem</namePart>
<namePart type="family">Hajj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walid</namePart>
<namePart type="family">Magdy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fethi</namePart>
<namePart type="family">Bougares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadi</namePart>
<namePart type="family">Tomeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kyiv, Ukraine (Virtual)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Arabic is characterised by a considerable number of varieties including spoken dialects. In this paper, we presented our models developed to participate in the NADI subtask 1.2 that requires building a system to distinguish between 21 country-level dialects. We investigated several classical machine learning approaches and deep learning models using small datasets. We examined an integration technique between two machine learning approaches. Additionally, we created dictionaries automatically based on Pointwise Mutual Information and labelled datasets, which enriched the feature space when training models. A semi-supervised learning approach was also examined and compared to other methods that exploit large unlabelled datasets, such as building pre-trained word embeddings. Our winning model was the Support Vector Machine with dictionary-based features and Pointwise Mutual Information values, achieving an 18.94% macro-average F1-score.</abstract>
<identifier type="citekey">althobaiti-2021-country</identifier>
<location>
<url>https://aclanthology.org/2021.wanlp-1.30</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>265</start>
<end>270</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Country-level Arabic Dialect Identification Using Small Datasets with Integrated Machine Learning Techniques and Deep Learning Models
%A Althobaiti, Maha J.
%Y Habash, Nizar
%Y Bouamor, Houda
%Y Hajj, Hazem
%Y Magdy, Walid
%Y Zaghouani, Wajdi
%Y Bougares, Fethi
%Y Tomeh, Nadi
%Y Abu Farha, Ibrahim
%Y Touileb, Samia
%S Proceedings of the Sixth Arabic Natural Language Processing Workshop
%D 2021
%8 April
%I Association for Computational Linguistics
%C Kyiv, Ukraine (Virtual)
%F althobaiti-2021-country
%X Arabic is characterised by a considerable number of varieties including spoken dialects. In this paper, we presented our models developed to participate in the NADI subtask 1.2 that requires building a system to distinguish between 21 country-level dialects. We investigated several classical machine learning approaches and deep learning models using small datasets. We examined an integration technique between two machine learning approaches. Additionally, we created dictionaries automatically based on Pointwise Mutual Information and labelled datasets, which enriched the feature space when training models. A semi-supervised learning approach was also examined and compared to other methods that exploit large unlabelled datasets, such as building pre-trained word embeddings. Our winning model was the Support Vector Machine with dictionary-based features and Pointwise Mutual Information values, achieving an 18.94% macro-average F1-score.
%U https://aclanthology.org/2021.wanlp-1.30
%P 265-270
Markdown (Informal)
[Country-level Arabic Dialect Identification Using Small Datasets with Integrated Machine Learning Techniques and Deep Learning Models](https://aclanthology.org/2021.wanlp-1.30) (Althobaiti, WANLP 2021)
ACL