@inproceedings{ahia-etal-2024-voices,
title = "Voices Unheard: {NLP} Resources and Models for {Y}or{\`u}b{\'a} Regional Dialects",
author = "Ahia, Orevaoghene and
Aremu, Anuoluwapo and
Abagyan, Diana and
Gonen, Hila and
Adelani, David Ifeoluwa and
Abolade, Daud and
Smith, Noah A. and
Tsvetkov, Yulia",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.251",
doi = "10.18653/v1/2024.emnlp-main.251",
pages = "4392--4409",
    abstract = "Yoruba{---}an African language with roughly 47 million speakers{---}encompasses a continuum of several dialects. Recent efforts to develop NLP technologies for African languages have focused on their standard dialects, resulting in disparities for dialects and varieties for which there are few to no resources or tools. We take steps towards bridging this gap by introducing a new high-quality parallel text and speech corpus, YORULECT, across three domains and four regional Yoruba dialects. To develop this corpus, we engaged native speakers, traveling to communities where these dialects are spoken, to collect text and speech data. Using our newly created corpus, we conducted extensive experiments on (text) machine translation, automatic speech recognition, and speech-to-text translation. Our results reveal substantial performance disparities between standard Yoruba and the other dialects across all tasks. However, we also show that with dialect-adaptive finetuning, we are able to narrow this gap. We believe our dataset and experimental analysis will contribute greatly to developing NLP tools for Yoruba and its dialects, and potentially for other African languages, by improving our understanding of existing challenges and offering a high-quality dataset for further development. We will release the YORULECT dataset and models publicly under an open license.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahia-etal-2024-voices">
<titleInfo>
<title>Voices Unheard: NLP Resources and Models for Yorùbá Regional Dialects</title>
</titleInfo>
<name type="personal">
<namePart type="given">Orevaoghene</namePart>
<namePart type="family">Ahia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anuoluwapo</namePart>
<namePart type="family">Aremu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Abagyan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hila</namePart>
<namePart type="family">Gonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">Ifeoluwa</namePart>
<namePart type="family">Adelani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daud</namePart>
<namePart type="family">Abolade</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noah</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulia</namePart>
<namePart type="family">Tsvetkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Yoruba—an African language with roughly 47 million speakers—encompasses a continuum of several dialects. Recent efforts to develop NLP technologies for African languages have focused on their standard dialects, resulting in disparities for dialects and varieties for which there are few to no resources or tools. We take steps towards bridging this gap by introducing a new high-quality parallel text and speech corpus, YORULECT, across three domains and four regional Yoruba dialects. To develop this corpus, we engaged native speakers, traveling to communities where these dialects are spoken, to collect text and speech data. Using our newly created corpus, we conducted extensive experiments on (text) machine translation, automatic speech recognition, and speech-to-text translation. Our results reveal substantial performance disparities between standard Yoruba and the other dialects across all tasks. However, we also show that with dialect-adaptive finetuning, we are able to narrow this gap. We believe our dataset and experimental analysis will contribute greatly to developing NLP tools for Yoruba and its dialects, and potentially for other African languages, by improving our understanding of existing challenges and offering a high-quality dataset for further development. We will release the YORULECT dataset and models publicly under an open license.</abstract>
<identifier type="citekey">ahia-etal-2024-voices</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.251</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.251</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>4392</start>
<end>4409</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Voices Unheard: NLP Resources and Models for Yorùbá Regional Dialects
%A Ahia, Orevaoghene
%A Aremu, Anuoluwapo
%A Abagyan, Diana
%A Gonen, Hila
%A Adelani, David Ifeoluwa
%A Abolade, Daud
%A Smith, Noah A.
%A Tsvetkov, Yulia
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F ahia-etal-2024-voices
%X Yoruba—an African language with roughly 47 million speakers—encompasses a continuum of several dialects. Recent efforts to develop NLP technologies for African languages have focused on their standard dialects, resulting in disparities for dialects and varieties for which there are few to no resources or tools. We take steps towards bridging this gap by introducing a new high-quality parallel text and speech corpus, YORULECT, across three domains and four regional Yoruba dialects. To develop this corpus, we engaged native speakers, traveling to communities where these dialects are spoken, to collect text and speech data. Using our newly created corpus, we conducted extensive experiments on (text) machine translation, automatic speech recognition, and speech-to-text translation. Our results reveal substantial performance disparities between standard Yoruba and the other dialects across all tasks. However, we also show that with dialect-adaptive finetuning, we are able to narrow this gap. We believe our dataset and experimental analysis will contribute greatly to developing NLP tools for Yoruba and its dialects, and potentially for other African languages, by improving our understanding of existing challenges and offering a high-quality dataset for further development. We will release the YORULECT dataset and models publicly under an open license.
%R 10.18653/v1/2024.emnlp-main.251
%U https://aclanthology.org/2024.emnlp-main.251
%U https://doi.org/10.18653/v1/2024.emnlp-main.251
%P 4392-4409
Markdown (Informal)
[Voices Unheard: NLP Resources and Models for Yorùbá Regional Dialects](https://aclanthology.org/2024.emnlp-main.251) (Ahia et al., EMNLP 2024)
ACL
Orevaoghene Ahia, Anuoluwapo Aremu, Diana Abagyan, Hila Gonen, David Ifeoluwa Adelani, Daud Abolade, Noah A. Smith, and Yulia Tsvetkov. 2024. Voices Unheard: NLP Resources and Models for Yorùbá Regional Dialects. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 4392–4409, Miami, Florida, USA. Association for Computational Linguistics.