@inproceedings{kanjirangat-etal-2024-nlp,
title = "{NLP}{\_}{DI} at {NADI} 2024 shared task: Multi-label {A}rabic Dialect Classifications with an Unsupervised Cross-Encoder",
author = "Kanjirangat, Vani and
Samardzic, Tanja and
Dolamic, Ljiljana and
Rinaldi, Fabio",
editor = "Habash, Nizar and
Bouamor, Houda and
Eskander, Ramy and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Abdelali, Ahmed and
Touileb, Samia and
Hamed, Injy and
Onaizan, Yaser and
Alhafni, Bashar and
Antoun, Wissam and
Khalifa, Salam and
Haddad, Hatem and
Zitouni, Imed and
AlKhamissi, Badr and
Almatham, Rawan and
Mrini, Khalil",
booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.arabicnlp-1.82",
doi = "10.18653/v1/2024.arabicnlp-1.82",
pages = "742--747",
abstract = "We report the approaches submitted to the NADI 2024 Subtask 1: Multi-label country-level Dialect Identification (MLDID). The core part was to adapt the information from multi-class data for a multi-label dialect classification task. We experimented with supervised and unsupervised strategies to tackle the task in this challenging setting. Under the supervised setup, we used the model trained using NADI 2023 data and devised approaches to convert the multi-class predictions to multi-label by using information from the confusion matrix or using calibrated probabilities. Under unsupervised settings, we used the Arabic-based sentence encoders and multilingual cross-encoders to retrieve similar samples from the training set, considering each test input as a query. The associated labels are then assigned to the input query. We also tried different variations, such as co-occurring dialects derived from the provided development set. We obtained the best validation performance of 48.5{\%} F-score using one of the variations with an unsupervised approach and the same approach yielded the best test result of 43.27{\%} (Ranked 2).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kanjirangat-etal-2024-nlp">
<titleInfo>
<title>NLP_DI at NADI 2024 shared task: Multi-label Arabic Dialect Classifications with an Unsupervised Cross-Encoder</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vani</namePart>
<namePart type="family">Kanjirangat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanja</namePart>
<namePart type="family">Samardzic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ljiljana</namePart>
<namePart type="family">Dolamic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Rinaldi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The Second Arabic Natural Language Processing Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramy</namePart>
<namePart type="family">Eskander</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadi</namePart>
<namePart type="family">Tomeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injy</namePart>
<namePart type="family">Hamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Alhafni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wissam</namePart>
<namePart type="family">Antoun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hatem</namePart>
<namePart type="family">Haddad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We report the approaches submitted to the NADI 2024 Subtask 1: Multi-label country-level Dialect Identification (MLDID). The core part was to adapt the information from multi-class data for a multi-label dialect classification task. We experimented with supervised and unsupervised strategies to tackle the task in this challenging setting. Under the supervised setup, we used the model trained using NADI 2023 data and devised approaches to convert the multi-class predictions to multi-label by using information from the confusion matrix or using calibrated probabilities. Under unsupervised settings, we used the Arabic-based sentence encoders and multilingual cross-encoders to retrieve similar samples from the training set, considering each test input as a query. The associated labels are then assigned to the input query. We also tried different variations, such as co-occurring dialects derived from the provided development set. We obtained the best validation performance of 48.5% F-score using one of the variations with an unsupervised approach and the same approach yielded the best test result of 43.27% (Ranked 2).</abstract>
<identifier type="citekey">kanjirangat-etal-2024-nlp</identifier>
<identifier type="doi">10.18653/v1/2024.arabicnlp-1.82</identifier>
<location>
<url>https://aclanthology.org/2024.arabicnlp-1.82</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>742</start>
<end>747</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NLP_DI at NADI 2024 shared task: Multi-label Arabic Dialect Classifications with an Unsupervised Cross-Encoder
%A Kanjirangat, Vani
%A Samardzic, Tanja
%A Dolamic, Ljiljana
%A Rinaldi, Fabio
%Y Habash, Nizar
%Y Bouamor, Houda
%Y Eskander, Ramy
%Y Tomeh, Nadi
%Y Abu Farha, Ibrahim
%Y Abdelali, Ahmed
%Y Touileb, Samia
%Y Hamed, Injy
%Y Onaizan, Yaser
%Y Alhafni, Bashar
%Y Antoun, Wissam
%Y Khalifa, Salam
%Y Haddad, Hatem
%Y Zitouni, Imed
%Y AlKhamissi, Badr
%Y Almatham, Rawan
%Y Mrini, Khalil
%S Proceedings of The Second Arabic Natural Language Processing Conference
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F kanjirangat-etal-2024-nlp
%X We report the approaches submitted to the NADI 2024 Subtask 1: Multi-label country-level Dialect Identification (MLDID). The core part was to adapt the information from multi-class data for a multi-label dialect classification task. We experimented with supervised and unsupervised strategies to tackle the task in this challenging setting. Under the supervised setup, we used the model trained using NADI 2023 data and devised approaches to convert the multi-class predictions to multi-label by using information from the confusion matrix or using calibrated probabilities. Under unsupervised settings, we used the Arabic-based sentence encoders and multilingual cross-encoders to retrieve similar samples from the training set, considering each test input as a query. The associated labels are then assigned to the input query. We also tried different variations, such as co-occurring dialects derived from the provided development set. We obtained the best validation performance of 48.5% F-score using one of the variations with an unsupervised approach and the same approach yielded the best test result of 43.27% (Ranked 2).
%R 10.18653/v1/2024.arabicnlp-1.82
%U https://aclanthology.org/2024.arabicnlp-1.82
%U https://doi.org/10.18653/v1/2024.arabicnlp-1.82
%P 742-747
Markdown (Informal)
[NLP_DI at NADI 2024 shared task: Multi-label Arabic Dialect Classifications with an Unsupervised Cross-Encoder](https://aclanthology.org/2024.arabicnlp-1.82) (Kanjirangat et al., ArabicNLP-WS 2024)
ACL