BibTeX
@inproceedings{sharma-etal-2025-fine,
title = "Fine-tuning Whisper Tiny for {S}wahili {ASR}: Challenges and Recommendations for Low-Resource Speech Recognition",
author = "Sharma, Avinash Kumar and
Pandya, Manas and
Shukla, Arpit",
editor = "Lignos, Constantine and
Abdulmumin, Idris and
Adelani, David",
booktitle = "Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.africanlp-1.11/",
doi = "10.18653/v1/2025.africanlp-1.11",
pages = "74--81",
ISBN = "979-8-89176-257-2",
abstract = "Automatic Speech Recognition (ASR) technologies have seen significant advancements, yet many widely spoken languages remain underrepresented. This paper explores the fine-tuning of OpenAI{'}s Whisper Tiny model (39M parameters) for Swahili, a lingua franca for over 100 million people across East Africa. Using a dataset of 5,520 Swahili audio samples, we analyze the model{'}s performance, error patterns, and limitations after fine-tuning. Our results demonstrate the potential of fine-tuning for improving transcription accuracy, while also highlighting persistent challenges such as phonetic misinterpretations, named entity recognition failures, and difficulties with morphologically complex words. We provide recommendations for improving Swahili ASR, including scaling to larger model variants, architectural adaptations for agglutinative languages, and data enhancement strategies. This work contributes to the growing body of research on adapting pre-trained multilingual ASR systems to low-resource languages, emphasizing the need for approaches that account for the unique linguistic features of Bantu languages."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sharma-etal-2025-fine">
    <titleInfo>
      <title>Fine-tuning Whisper Tiny for Swahili ASR: Challenges and Recommendations for Low-Resource Speech Recognition</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Avinash</namePart>
      <namePart type="given">Kumar</namePart>
      <namePart type="family">Sharma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Manas</namePart>
      <namePart type="family">Pandya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Arpit</namePart>
      <namePart type="family">Shukla</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Constantine</namePart>
        <namePart type="family">Lignos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Idris</namePart>
        <namePart type="family">Abdulmumin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Adelani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-257-2</identifier>
    </relatedItem>
    <abstract>Automatic Speech Recognition (ASR) technologies have seen significant advancements, yet many widely spoken languages remain underrepresented. This paper explores the fine-tuning of OpenAI’s Whisper Tiny model (39M parameters) for Swahili, a lingua franca for over 100 million people across East Africa. Using a dataset of 5,520 Swahili audio samples, we analyze the model’s performance, error patterns, and limitations after fine-tuning. Our results demonstrate the potential of fine-tuning for improving transcription accuracy, while also highlighting persistent challenges such as phonetic misinterpretations, named entity recognition failures, and difficulties with morphologically complex words. We provide recommendations for improving Swahili ASR, including scaling to larger model variants, architectural adaptations for agglutinative languages, and data enhancement strategies. This work contributes to the growing body of research on adapting pre-trained multilingual ASR systems to low-resource languages, emphasizing the need for approaches that account for the unique linguistic features of Bantu languages.</abstract>
    <identifier type="citekey">sharma-etal-2025-fine</identifier>
    <identifier type="doi">10.18653/v1/2025.africanlp-1.11</identifier>
    <location>
      <url>https://aclanthology.org/2025.africanlp-1.11/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>74</start>
        <end>81</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Fine-tuning Whisper Tiny for Swahili ASR: Challenges and Recommendations for Low-Resource Speech Recognition
%A Sharma, Avinash Kumar
%A Pandya, Manas
%A Shukla, Arpit
%Y Lignos, Constantine
%Y Abdulmumin, Idris
%Y Adelani, David
%S Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-257-2
%F sharma-etal-2025-fine
%X Automatic Speech Recognition (ASR) technologies have seen significant advancements, yet many widely spoken languages remain underrepresented. This paper explores the fine-tuning of OpenAI’s Whisper Tiny model (39M parameters) for Swahili, a lingua franca for over 100 million people across East Africa. Using a dataset of 5,520 Swahili audio samples, we analyze the model’s performance, error patterns, and limitations after fine-tuning. Our results demonstrate the potential of fine-tuning for improving transcription accuracy, while also highlighting persistent challenges such as phonetic misinterpretations, named entity recognition failures, and difficulties with morphologically complex words. We provide recommendations for improving Swahili ASR, including scaling to larger model variants, architectural adaptations for agglutinative languages, and data enhancement strategies. This work contributes to the growing body of research on adapting pre-trained multilingual ASR systems to low-resource languages, emphasizing the need for approaches that account for the unique linguistic features of Bantu languages.
%R 10.18653/v1/2025.africanlp-1.11
%U https://aclanthology.org/2025.africanlp-1.11/
%U https://doi.org/10.18653/v1/2025.africanlp-1.11
%P 74-81

Markdown (Informal)
[Fine-tuning Whisper Tiny for Swahili ASR: Challenges and Recommendations for Low-Resource Speech Recognition](https://aclanthology.org/2025.africanlp-1.11/) (Sharma et al., AfricaNLP 2025)

ACL
Avinash Kumar Sharma, Manas Pandya, and Arpit Shukla. 2025. Fine-tuning Whisper Tiny for Swahili ASR: Challenges and Recommendations for Low-Resource Speech Recognition. In Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025), pages 74–81, Vienna, Austria. Association for Computational Linguistics.