% Casablanca (EMNLP 2024): conference paper presenting a multi-dialectal Arabic
% speech dataset and baselines. Proper nouns in the title are brace-protected as
% whole words so that sentence-casing styles keep their capitals.
@inproceedings{talafha-etal-2024-casablanca,
  title     = {{Casablanca}: Data and Models for Multidialectal {Arabic} Speech Recognition},
  author    = {Talafha, Bashar and
               Kadaoui, Karima and
               Magdy, Samar Mohamed and
               Habiboullah, Mariem and
               Chafei, Chafei Mohamed and
               El-Shangiti, Ahmed Oumar and
               Zayed, Hiba and
               Tourad, Mohamedou Cheikh and
               Alhamouri, Rahaf and
               Assi, Rwaa and
               Alraeesi, Aisha and
               Mohamed, Hour and
               Alwajih, Fakhraddin and
               Mohamed, Abdelrahman and
               El Mekki, Abdellah and
               Nagoudi, El Moatez Billah and
               Saadia, Benelhadj Djelloul Mama and
               Alsayadi, Hamzah A. and
               Al-Dhabyani, Walid and
               Shatnawi, Sara and
               Ech-chammakhy, Yasir and
               Makouar, Amal and
               Berrachedi, Yousra and
               Jarrar, Mustafa and
               Shehata, Shady and
               Berrada, Ismail and
               Abdul-Mageed, Muhammad},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = 2024,
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.1211},
  doi       = {10.18653/v1/2024.emnlp-main.1211},
  pages     = {21745--21758},
  abstract  = {In spite of the recent progress in speech processing, the majority of world languages and dialects remain uncovered. This situation only furthers an already wide technological divide, thereby hindering technological and socioeconomic inclusion. This challenge is largely due to the absence of datasets that can empower diverse speech systems. In this paper, we seek to mitigate this obstacle for a number of Arabic dialects by presenting Casablanca, a large-scale community-driven effort to collect and transcribe a multi-dialectal Arabic dataset. The dataset covers eight dialects: Algerian, Egyptian, Emirati, Jordanian, Mauritanian, Moroccan, Palestinian, and Yemeni, and includes annotations for transcription, gender, dialect, and code-switching. We also develop a number of strong baselines exploiting Casablanca. The project page for Casablanca is accessible at: www.dlnlp.ai/speech/casablanca.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="talafha-etal-2024-casablanca">
<titleInfo>
<title>Casablanca: Data and Models for Multidialectal Arabic Speech Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Talafha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karima</namePart>
<namePart type="family">Kadaoui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samar</namePart>
<namePart type="given">Mohamed</namePart>
<namePart type="family">Magdy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mariem</namePart>
<namePart type="family">Habiboullah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chafei</namePart>
<namePart type="given">Mohamed</namePart>
<namePart type="family">Chafei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="given">Oumar</namePart>
<namePart type="family">El-Shangiti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiba</namePart>
<namePart type="family">Zayed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamedou</namePart>
<namePart type="given">Cheikh</namePart>
<namePart type="family">Tourad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahaf</namePart>
<namePart type="family">Alhamouri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rwaa</namePart>
<namePart type="family">Assi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aisha</namePart>
<namePart type="family">Alraeesi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hour</namePart>
<namePart type="family">Mohamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fakhraddin</namePart>
<namePart type="family">Alwajih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelrahman</namePart>
<namePart type="family">Mohamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdellah</namePart>
<namePart type="family">El Mekki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">El</namePart>
<namePart type="given">Moatez</namePart>
<namePart type="given">Billah</namePart>
<namePart type="family">Nagoudi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benelhadj</namePart>
<namePart type="given">Djelloul</namePart>
<namePart type="given">Mama</namePart>
<namePart type="family">Saadia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamzah</namePart>
<namePart type="given">A.</namePart>
<namePart type="family">Alsayadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walid</namePart>
<namePart type="family">Al-Dhabyani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Shatnawi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yasir</namePart>
<namePart type="family">Ech-chammakhy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amal</namePart>
<namePart type="family">Makouar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yousra</namePart>
<namePart type="family">Berrachedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mustafa</namePart>
<namePart type="family">Jarrar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shady</namePart>
<namePart type="family">Shehata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ismail</namePart>
<namePart type="family">Berrada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="family">Abdul-Mageed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In spite of the recent progress in speech processing, the majority of world languages and dialects remain uncovered. This situation only furthers an already wide technological divide, thereby hindering technological and socioeconomic inclusion. This challenge is largely due to the absence of datasets that can empower diverse speech systems. In this paper, we seek to mitigate this obstacle for a number of Arabic dialects by presenting Casablanca, a large-scale community-driven effort to collect and transcribe a multi-dialectal Arabic dataset. The dataset covers eight dialects: Algerian, Egyptian, Emirati, Jordanian, Mauritanian, Moroccan, Palestinian, and Yemeni, and includes annotations for transcription, gender, dialect, and code-switching. We also develop a number of strong baselines exploiting Casablanca. The project page for Casablanca is accessible at: www.dlnlp.ai/speech/casablanca.</abstract>
<identifier type="citekey">talafha-etal-2024-casablanca</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.1211</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1211</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>21745</start>
<end>21758</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Casablanca: Data and Models for Multidialectal Arabic Speech Recognition
%A Talafha, Bashar
%A Kadaoui, Karima
%A Magdy, Samar Mohamed
%A Habiboullah, Mariem
%A Chafei, Chafei Mohamed
%A El-Shangiti, Ahmed Oumar
%A Zayed, Hiba
%A Tourad, Mohamedou Cheikh
%A Alhamouri, Rahaf
%A Assi, Rwaa
%A Alraeesi, Aisha
%A Mohamed, Hour
%A Alwajih, Fakhraddin
%A Mohamed, Abdelrahman
%A El Mekki, Abdellah
%A Nagoudi, El Moatez Billah
%A Saadia, Benelhadj Djelloul Mama
%A Alsayadi, Hamzah A.
%A Al-Dhabyani, Walid
%A Shatnawi, Sara
%A Ech-chammakhy, Yasir
%A Makouar, Amal
%A Berrachedi, Yousra
%A Jarrar, Mustafa
%A Shehata, Shady
%A Berrada, Ismail
%A Abdul-Mageed, Muhammad
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F talafha-etal-2024-casablanca
%X In spite of the recent progress in speech processing, the majority of world languages and dialects remain uncovered. This situation only furthers an already wide technological divide, thereby hindering technological and socioeconomic inclusion. This challenge is largely due to the absence of datasets that can empower diverse speech systems. In this paper, we seek to mitigate this obstacle for a number of Arabic dialects by presenting Casablanca, a large-scale community-driven effort to collect and transcribe a multi-dialectal Arabic dataset. The dataset covers eight dialects: Algerian, Egyptian, Emirati, Jordanian, Mauritanian, Moroccan, Palestinian, and Yemeni, and includes annotations for transcription, gender, dialect, and code-switching. We also develop a number of strong baselines exploiting Casablanca. The project page for Casablanca is accessible at: www.dlnlp.ai/speech/casablanca.
%R 10.18653/v1/2024.emnlp-main.1211
%U https://aclanthology.org/2024.emnlp-main.1211
%U https://doi.org/10.18653/v1/2024.emnlp-main.1211
%P 21745-21758
Markdown (Informal)
[Casablanca: Data and Models for Multidialectal Arabic Speech Recognition](https://aclanthology.org/2024.emnlp-main.1211) (Talafha et al., EMNLP 2024)
ACL
- Bashar Talafha, Karima Kadaoui, Samar Mohamed Magdy, Mariem Habiboullah, Chafei Mohamed Chafei, Ahmed Oumar El-Shangiti, Hiba Zayed, Mohamedou Cheikh Tourad, Rahaf Alhamouri, Rwaa Assi, Aisha Alraeesi, Hour Mohamed, Fakhraddin Alwajih, Abdelrahman Mohamed, Abdellah El Mekki, El Moatez Billah Nagoudi, Benelhadj Djelloul Mama Saadia, Hamzah A. Alsayadi, Walid Al-Dhabyani, et al. 2024. Casablanca: Data and Models for Multidialectal Arabic Speech Recognition. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 21745–21758, Miami, Florida, USA. Association for Computational Linguistics.