@inproceedings{tapo-etal-2025-bayelemabaga,
title = "Bayelemabaga: Creating Resources for {B}ambara {NLP}",
author = "Tapo, Allahsera Auguste and
Assogba, Kevin and
Homan, Christopher M. and
Rafique, M. Mustafa and
Zampieri, Marcos",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.602/",
doi = "10.18653/v1/2025.naacl-long.602",
pages = "12060--12070",
ISBN = "979-8-89176-189-6",
abstract = "Data curation for under-resource languages enables the development of more accurate and culturally sensitive natural language processing models. However, the scarcity of well-structured multilingual datasets remains a challenge for advancing machine translation in these languages, especially for African languages. This paper focuses on creating high-quality parallel corpora that capture linguistic diversity to address this gap. We introduce Bayelemabaga, the most extensive curated multilingual dataset for machine translation in the Bambara language, the vehicular language of Mali. The dataset consists of 47K Bambara-French parallel sentences curated from 231 data sources, including short stories, formal documents, and religious literature, combining modern, historical, and indigenous languages. We present our data curation process and analyze its impact on neural machine translation by fine-tuning seven commonly used transformer-based language models, i.e., MBART, MT5, M2M-100, NLLB-200, Mistral-7B, Open-Llama-7B, and Meta-Llama3-8B on Bayelemabaga. Our evaluation on four Bambara-French language pair datasets (three existing datasets and the test set of Bayelemabaga) show up to $+4.5$, $+11.4$, and $+0.27$ in gains, respectively, on BLEU, CHRF++, and AfriCOMET evaluation metrics. We also conducted machine and human evaluations of translations from studied models to compare the machine translation quality of encoder-decoder and decoder-only models. Our results indicate that encoder-decoder models remain the best, highlighting the importance of additional datasets to train decoder-only models."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tapo-etal-2025-bayelemabaga">
<titleInfo>
<title>Bayelemabaga: Creating Resources for Bambara NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Allahsera</namePart>
<namePart type="given">Auguste</namePart>
<namePart type="family">Tapo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Assogba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Homan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">M</namePart>
<namePart type="given">Mustafa</namePart>
<namePart type="family">Rafique</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Data curation for under-resource languages enables the development of more accurate and culturally sensitive natural language processing models. However, the scarcity of well-structured multilingual datasets remains a challenge for advancing machine translation in these languages, especially for African languages. This paper focuses on creating high-quality parallel corpora that capture linguistic diversity to address this gap. We introduce Bayelemabaga, the most extensive curated multilingual dataset for machine translation in the Bambara language, the vehicular language of Mali. The dataset consists of 47K Bambara-French parallel sentences curated from 231 data sources, including short stories, formal documents, and religious literature, combining modern, historical, and indigenous languages. We present our data curation process and analyze its impact on neural machine translation by fine-tuning seven commonly used transformer-based language models, i.e., MBART, MT5, M2M-100, NLLB-200, Mistral-7B, Open-Llama-7B, and Meta-Llama3-8B on Bayelemabaga. Our evaluation on four Bambara-French language pair datasets (three existing datasets and the test set of Bayelemabaga) show up to +4.5, +11.4, and +0.27 in gains, respectively, on BLEU, CHRF++, and AfriCOMET evaluation metrics. We also conducted machine and human evaluations of translations from studied models to compare the machine translation quality of encoder-decoder and decoder-only models. Our results indicate that encoder-decoder models remain the best, highlighting the importance of additional datasets to train decoder-only models.</abstract>
<identifier type="citekey">tapo-etal-2025-bayelemabaga</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.602</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.602/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>12060</start>
<end>12070</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bayelemabaga: Creating Resources for Bambara NLP
%A Tapo, Allahsera Auguste
%A Assogba, Kevin
%A Homan, Christopher M.
%A Rafique, M. Mustafa
%A Zampieri, Marcos
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F tapo-etal-2025-bayelemabaga
%X Data curation for under-resourced languages enables the development of more accurate and culturally sensitive natural language processing models. However, the scarcity of well-structured multilingual datasets remains a challenge for advancing machine translation in these languages, especially for African languages. This paper focuses on creating high-quality parallel corpora that capture linguistic diversity to address this gap. We introduce Bayelemabaga, the most extensive curated multilingual dataset for machine translation in the Bambara language, the vehicular language of Mali. The dataset consists of 47K Bambara-French parallel sentences curated from 231 data sources, including short stories, formal documents, and religious literature, combining modern, historical, and indigenous languages. We present our data curation process and analyze its impact on neural machine translation by fine-tuning seven commonly used transformer-based language models, i.e., MBART, MT5, M2M-100, NLLB-200, Mistral-7B, Open-Llama-7B, and Meta-Llama3-8B, on Bayelemabaga. Our evaluation on four Bambara-French language pair datasets (three existing datasets and the test set of Bayelemabaga) shows gains of up to +4.5 in BLEU, +11.4 in CHRF++, and +0.27 in AfriCOMET. We also conducted machine and human evaluations of translations from the studied models to compare the machine translation quality of encoder-decoder and decoder-only models. Our results indicate that encoder-decoder models remain the best, highlighting the importance of additional datasets to train decoder-only models.
%R 10.18653/v1/2025.naacl-long.602
%U https://aclanthology.org/2025.naacl-long.602/
%U https://doi.org/10.18653/v1/2025.naacl-long.602
%P 12060-12070

Markdown (Informal)

[Bayelemabaga: Creating Resources for Bambara NLP](https://aclanthology.org/2025.naacl-long.602/) (Tapo et al., NAACL 2025)

ACL

Allahsera Auguste Tapo, Kevin Assogba, Christopher M. Homan, M. Mustafa Rafique, and Marcos Zampieri. 2025. Bayelemabaga: Creating Resources for Bambara NLP. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 12060–12070, Albuquerque, New Mexico. Association for Computational Linguistics.
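
For readers who want to reproduce the kind of BLEU and chrF++ scores reported in the abstract, the sketch below shows how such corpus-level scores are conventionally computed with the sacreBLEU library (assuming sacreBLEU 2.x). This is illustrative only, not the authors' evaluation code: the file names hypotheses.fr and references.fr are hypothetical placeholders, and AfriCOMET, a separately distributed learned metric, is omitted.

# Minimal sketch: corpus-level BLEU and chrF++ with sacreBLEU 2.x.
# File names are hypothetical placeholders, not taken from the paper.
import sacrebleu

with open("hypotheses.fr", encoding="utf-8") as f:
    hypotheses = [line.strip() for line in f]
with open("references.fr", encoding="utf-8") as f:
    references = [line.strip() for line in f]

# corpus_bleu expects a list of reference streams, hence [references].
bleu = sacrebleu.corpus_bleu(hypotheses, [references])
# chrF++ is chrF extended with word n-grams up to order 2.
chrf = sacrebleu.corpus_chrf(hypotheses, [references], word_order=2)

print(f"BLEU:   {bleu.score:.1f}")
print(f"chrF++: {chrf.score:.1f}")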