@inproceedings{leventhal-etal-2026-dealing,
title = "Dealing with the Hard Facts of Low-Resource {A}frican {NLP}",
author = "Leventhal, Michael and
Diarra, Yacouba and
Coulibaly, Nouhoum and
Kamat{\'e}, Panga Azazia and
Demb{\'e}l{\'e}, Aymane and
Tall, Madani Amadou and
Kone, Emmanuel Elise",
editor = "Chimoto, Everlyn Asiko and
Lignos, Constantine and
Muhammad, Shamsuddeen and
Abdulmumin, Idris and
Siro, Clemencia and
Adelani, David Ifeoluwa",
booktitle = "Proceedings of the 7th Workshop on {A}frican Natural Language Processing ({A}frica{NLP} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.africanlp-main.1/",
pages = "1--10",
ISBN = "979-8-89176-364-7",
abstract = "Creating speech datasets, models, and evaluation frameworks for low-resource languages remains challenging given the lack of a broad base of pertinent experience to draw from. This paper reports on the field collection of 612 hours of spontaneous speech in Bambara, a low-resource West African language; the semi-automated annotation of that dataset with transcriptions; the creation of several monolingual ultra-compact and small models using the dataset; and the automatic and human evaluation of their output. We offer practical suggestions for data collection protocols, annotation, and model design, as well as evidence for the importance of performing human evaluation. In addition to the main dataset, multiple evaluation datasets, models, and code are made publicly available."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="leventhal-etal-2026-dealing">
<titleInfo>
<title>Dealing with the Hard Facts of Low-Resource African NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Leventhal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yacouba</namePart>
<namePart type="family">Diarra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nouhoum</namePart>
<namePart type="family">Coulibaly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Panga</namePart>
<namePart type="given">Azazia</namePart>
<namePart type="family">Kamaté</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aymane</namePart>
<namePart type="family">Dembélé</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Madani</namePart>
<namePart type="given">Amadou</namePart>
<namePart type="family">Tall</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmanuel</namePart>
<namePart type="given">Elise</namePart>
<namePart type="family">Kone</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on African Natural Language Processing (AfricaNLP 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Everlyn</namePart>
<namePart type="given">Asiko</namePart>
<namePart type="family">Chimoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Constantine</namePart>
<namePart type="family">Lignos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shamsuddeen</namePart>
<namePart type="family">Muhammad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Idris</namePart>
<namePart type="family">Abdulmumin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clemencia</namePart>
<namePart type="family">Siro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">Ifeoluwa</namePart>
<namePart type="family">Adelani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-364-7</identifier>
</relatedItem>
<abstract>Creating speech datasets, models, and evaluation frameworks for low-resource languages remains challenging given the lack of a broad base of pertinent experience to draw from. This paper reports on the field collection of 612 hours of spontaneous speech in Bambara, a low-resource West African language; the semi-automated annotation of that dataset with transcriptions; the creation of several monolingual ultra-compact and small models using the dataset; and the automatic and human evaluation of their output. We offer practical suggestions for data collection protocols, annotation, and model design, as well as evidence for the importance of performing human evaluation. In addition to the main dataset, multiple evaluation datasets, models, and code are made publicly available.</abstract>
<identifier type="citekey">leventhal-etal-2026-dealing</identifier>
<location>
<url>https://aclanthology.org/2026.africanlp-main.1/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>1</start>
<end>10</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dealing with the Hard Facts of Low-Resource African NLP
%A Leventhal, Michael
%A Diarra, Yacouba
%A Coulibaly, Nouhoum
%A Kamaté, Panga Azazia
%A Dembélé, Aymane
%A Tall, Madani Amadou
%A Kone, Emmanuel Elise
%Y Chimoto, Everlyn Asiko
%Y Lignos, Constantine
%Y Muhammad, Shamsuddeen
%Y Abdulmumin, Idris
%Y Siro, Clemencia
%Y Adelani, David Ifeoluwa
%S Proceedings of the 7th Workshop on African Natural Language Processing (AfricaNLP 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-364-7
%F leventhal-etal-2026-dealing
%X Creating speech datasets, models, and evaluation frameworks for low-resource languages remains challenging given the lack of a broad base of pertinent experience to draw from. This paper reports on the field collection of 612 hours of spontaneous speech in Bambara, a low-resource West African language; the semi-automated annotation of that dataset with transcriptions; the creation of several monolingual ultra-compact and small models using the dataset; and the automatic and human evaluation of their output. We offer practical suggestions for data collection protocols, annotation, and model design, as well as evidence for the importance of performing human evaluation. In addition to the main dataset, multiple evaluation datasets, models, and code are made publicly available.
%U https://aclanthology.org/2026.africanlp-main.1/
%P 1-10
Markdown (Informal)
[Dealing with the Hard Facts of Low-Resource African NLP](https://aclanthology.org/2026.africanlp-main.1/) (Leventhal et al., AfricaNLP 2026)
ACL
- Michael Leventhal, Yacouba Diarra, Nouhoum Coulibaly, Panga Azazia Kamaté, Aymane Dembélé, Madani Amadou Tall, and Emmanuel Elise Kone. 2026. Dealing with the Hard Facts of Low-Resource African NLP. In Proceedings of the 7th Workshop on African Natural Language Processing (AfricaNLP 2026), pages 1–10, Rabat, Morocco. Association for Computational Linguistics.