@inproceedings{sakr-etal-2024-alexunlp,
title = "{A}lex{UNLP}-{STM} at {NADI} 2024 shared task: Quantifying the {A}rabic Dialect Spectrum with Contrastive Learning, Weighted Sampling, and {BERT}-based Regression Ensemble",
author = "Sakr, Abdelrahman and
Torki, Marwan and
El-Makky, Nagwa",
editor = "Habash, Nizar and
Bouamor, Houda and
Eskander, Ramy and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Abdelali, Ahmed and
Touileb, Samia and
Hamed, Injy and
Onaizan, Yaser and
Alhafni, Bashar and
Antoun, Wissam and
Khalifa, Salam and
Haddad, Hatem and
Zitouni, Imed and
AlKhamissi, Badr and
Almatham, Rawan and
Mrini, Khalil",
booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.arabicnlp-1.81",
doi = "10.18653/v1/2024.arabicnlp-1.81",
pages = "735--741",
abstract = "Recognizing the nuanced spectrum of dialectness in Arabic text poses a significant challenge for natural language processing (NLP) tasks. Traditional dialect identification (DI) methods treat the task as binary, overlooking the continuum of dialect variation present in Arabic speech and text. In this paper, we describe our submission to the NADI shared Task of ArabicNLP 2024. We participated in Subtask 2 - ALDi Estimation, which focuses on estimating the Arabic Level of Dialectness (ALDi) for Arabic text, indicating how much it deviates from Modern Standard Arabic (MSA) on a scale from 0 to 1, where 0 means MSA and 1 means high divergence from MSA. We explore diverse training approaches, including contrastive learning, applying a random weighted sampler along with fine-tuning a regression task based on the AraBERT model, after adding a linear and non-linear layer on top of its pooled output. Finally, performing a brute force ensemble strategy increases the performance of our system. Our proposed solution achieved a Root Mean Squared Error (RMSE) of 0.1406, ranking second on the leaderboard.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sakr-etal-2024-alexunlp">
<titleInfo>
<title>AlexUNLP-STM at NADI 2024 shared task: Quantifying the Arabic Dialect Spectrum with Contrastive Learning, Weighted Sampling, and BERT-based Regression Ensemble</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abdelrahman</namePart>
<namePart type="family">Sakr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marwan</namePart>
<namePart type="family">Torki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nagwa</namePart>
<namePart type="family">El-Makky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The Second Arabic Natural Language Processing Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramy</namePart>
<namePart type="family">Eskander</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadi</namePart>
<namePart type="family">Tomeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injy</namePart>
<namePart type="family">Hamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Alhafni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wissam</namePart>
<namePart type="family">Antoun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hatem</namePart>
<namePart type="family">Haddad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recognizing the nuanced spectrum of dialectness in Arabic text poses a significant challenge for natural language processing (NLP) tasks. Traditional dialect identification (DI) methods treat the task as binary, overlooking the continuum of dialect variation present in Arabic speech and text. In this paper, we describe our submission to the NADI shared Task of ArabicNLP 2024. We participated in Subtask 2 - ALDi Estimation, which focuses on estimating the Arabic Level of Dialectness (ALDi) for Arabic text, indicating how much it deviates from Modern Standard Arabic (MSA) on a scale from 0 to 1, where 0 means MSA and 1 means high divergence from MSA. We explore diverse training approaches, including contrastive learning, applying a random weighted sampler along with fine-tuning a regression task based on the AraBERT model, after adding a linear and non-linear layer on top of its pooled output. Finally, performing a brute force ensemble strategy increases the performance of our system. Our proposed solution achieved a Root Mean Squared Error (RMSE) of 0.1406, ranking second on the leaderboard.</abstract>
<identifier type="citekey">sakr-etal-2024-alexunlp</identifier>
<identifier type="doi">10.18653/v1/2024.arabicnlp-1.81</identifier>
<location>
<url>https://aclanthology.org/2024.arabicnlp-1.81</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>735</start>
<end>741</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AlexUNLP-STM at NADI 2024 shared task: Quantifying the Arabic Dialect Spectrum with Contrastive Learning, Weighted Sampling, and BERT-based Regression Ensemble
%A Sakr, Abdelrahman
%A Torki, Marwan
%A El-Makky, Nagwa
%Y Habash, Nizar
%Y Bouamor, Houda
%Y Eskander, Ramy
%Y Tomeh, Nadi
%Y Abu Farha, Ibrahim
%Y Abdelali, Ahmed
%Y Touileb, Samia
%Y Hamed, Injy
%Y Onaizan, Yaser
%Y Alhafni, Bashar
%Y Antoun, Wissam
%Y Khalifa, Salam
%Y Haddad, Hatem
%Y Zitouni, Imed
%Y AlKhamissi, Badr
%Y Almatham, Rawan
%Y Mrini, Khalil
%S Proceedings of The Second Arabic Natural Language Processing Conference
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F sakr-etal-2024-alexunlp
%X Recognizing the nuanced spectrum of dialectness in Arabic text poses a significant challenge for natural language processing (NLP) tasks. Traditional dialect identification (DI) methods treat the task as binary, overlooking the continuum of dialect variation present in Arabic speech and text. In this paper, we describe our submission to the NADI shared Task of ArabicNLP 2024. We participated in Subtask 2 - ALDi Estimation, which focuses on estimating the Arabic Level of Dialectness (ALDi) for Arabic text, indicating how much it deviates from Modern Standard Arabic (MSA) on a scale from 0 to 1, where 0 means MSA and 1 means high divergence from MSA. We explore diverse training approaches, including contrastive learning, applying a random weighted sampler along with fine-tuning a regression task based on the AraBERT model, after adding a linear and non-linear layer on top of its pooled output. Finally, performing a brute force ensemble strategy increases the performance of our system. Our proposed solution achieved a Root Mean Squared Error (RMSE) of 0.1406, ranking second on the leaderboard.
%R 10.18653/v1/2024.arabicnlp-1.81
%U https://aclanthology.org/2024.arabicnlp-1.81
%U https://doi.org/10.18653/v1/2024.arabicnlp-1.81
%P 735-741
Markdown (Informal)
[AlexUNLP-STM at NADI 2024 shared task: Quantifying the Arabic Dialect Spectrum with Contrastive Learning, Weighted Sampling, and BERT-based Regression Ensemble](https://aclanthology.org/2024.arabicnlp-1.81) (Sakr et al., ArabicNLP-WS 2024)
ACL