@inproceedings{siminyu-etal-2025-tolerance,
title = "On the Tolerance of Repetition Before Performance Degradation in Kiswahili Automatic Speech Recognition",
author = "Siminyu, Kathleen and
Reid, Kathy and
Ryakitimbo, Rebecca and
Mwasaru, Britone and
Chair, Chenai",
editor = "Lignos, Constantine and
Abdulmumin, Idris and
Adelani, David",
booktitle = "Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.africanlp-1.3/",
doi = "10.18653/v1/2025.africanlp-1.3",
pages = "15--23",
ISBN = "979-8-89176-257-2",
abstract = "State of the art end-to-end automatic speech recognition (ASR) models require large speech datasets for training. The Mozilla Common Voice project crowd-sources read speech to address this need. However, this approach often results in many audio utterances being recorded for each written sentence. Using Kiswahili speech data, this paper first explores how much audio repetition in utterances is permissible in a training set before model degradation occurs, then examines the extent to which audio augmentation techniques can be employed to increase the diversity of speech characteristics and improve accuracy. We find that repetition up to a ratio of 1 sentence to 8 audio recordings improves performance, but performance degrades at a ratio of 1:16. We also find small improvements from frequency mask, time mask and tempo augmentation. Our findings provide guidance on training set construction for ASR practitioners, particularly those working in under-served languages."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="siminyu-etal-2025-tolerance">
<titleInfo>
<title>On the Tolerance of Repetition Before Performance Degradation in Kiswahili Automatic Speech Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kathleen</namePart>
<namePart type="family">Siminyu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kathy</namePart>
<namePart type="family">Reid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rebecca</namePart>
<namePart type="family">Ryakitimbo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Britone</namePart>
<namePart type="family">Mwasaru</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenai</namePart>
<namePart type="family">Chair</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Constantine</namePart>
<namePart type="family">Lignos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Idris</namePart>
<namePart type="family">Abdulmumin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Adelani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-257-2</identifier>
</relatedItem>
<abstract>State of the art end-to-end automatic speech recognition (ASR) models require large speech datasets for training. The Mozilla Common Voice project crowd-sources read speech to address this need. However, this approach often results in many audio utterances being recorded for each written sentence. Using Kiswahili speech data, this paper first explores how much audio repetition in utterances is permissible in a training set before model degradation occurs, then examines the extent to which audio augmentation techniques can be employed to increase the diversity of speech characteristics and improve accuracy. We find that repetition up to a ratio of 1 sentence to 8 audio recordings improves performance, but performance degrades at a ratio of 1:16. We also find small improvements from frequency mask, time mask and tempo augmentation. Our findings provide guidance on training set construction for ASR practitioners, particularly those working in under-served languages.</abstract>
<identifier type="citekey">siminyu-etal-2025-tolerance</identifier>
<identifier type="doi">10.18653/v1/2025.africanlp-1.3</identifier>
<location>
<url>https://aclanthology.org/2025.africanlp-1.3/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>15</start>
<end>23</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Tolerance of Repetition Before Performance Degradation in Kiswahili Automatic Speech Recognition
%A Siminyu, Kathleen
%A Reid, Kathy
%A Ryakitimbo, Rebecca
%A Mwasaru, Britone
%A Chair, Chenai
%Y Lignos, Constantine
%Y Abdulmumin, Idris
%Y Adelani, David
%S Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-257-2
%F siminyu-etal-2025-tolerance
%X State of the art end-to-end automatic speech recognition (ASR) models require large speech datasets for training. The Mozilla Common Voice project crowd-sources read speech to address this need. However, this approach often results in many audio utterances being recorded for each written sentence. Using Kiswahili speech data, this paper first explores how much audio repetition in utterances is permissible in a training set before model degradation occurs, then examines the extent to which audio augmentation techniques can be employed to increase the diversity of speech characteristics and improve accuracy. We find that repetition up to a ratio of 1 sentence to 8 audio recordings improves performance, but performance degrades at a ratio of 1:16. We also find small improvements from frequency mask, time mask and tempo augmentation. Our findings provide guidance on training set construction for ASR practitioners, particularly those working in under-served languages.
%R 10.18653/v1/2025.africanlp-1.3
%U https://aclanthology.org/2025.africanlp-1.3/
%U https://doi.org/10.18653/v1/2025.africanlp-1.3
%P 15-23
Markdown (Informal)
[On the Tolerance of Repetition Before Performance Degradation in Kiswahili Automatic Speech Recognition](https://aclanthology.org/2025.africanlp-1.3/) (Siminyu et al., AfricaNLP 2025)
ACL