@inproceedings{olusanya-2026-tone,
  title     = {Tone in {Y}oruba {ASR}: Evaluating the Impact of Tone Recognition on Transformer-Based {ASR} Models},
  author    = {Olusanya, Joy},
  editor    = {Hettiarachchi, Hansi and
               Ranasinghe, Tharindu and
               Plum, Alistair and
               Rayson, Paul and
               Mitkov, Ruslan and
               Gaber, Mohamed and
               Premasiri, Damith and
               Tan, Fiona Anting and
               Uyangodage, Lasitha},
  booktitle = {Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({L}o{R}es{LM} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.loreslm-1.14/},
  pages     = {149--156},
  isbn      = {979-8-89176-377-7},
  abstract  = {This research investigates the role of tone in Standard Yoruba Automatic Speech Recognition (ASR), focusing on how explicit tone marking (diacritics) influences accuracy and overall system performance. As a low-resource tonal language, Yoruba encodes critical lexical and grammatical contrasts via pitch, making tone handling both essential and challenging for ASR. Three pre-trained models, Meta{'}s MMS-1B-all, OpenAI{'}s Whisper-small, and AstralZander/Yoruba{\_}ASR, were trained and evaluated on datasets that vary by tone annotation (fully tone-marked vs. non-tone-marked). Using Word Error Rate (WER) and Tone Error Rate (TER) as primary metrics, results consistently favored non-tone-marked data, yielding substantially lower error rates than their tone-marked counterparts. These outcomes suggest that current architectures encounter difficulties with diacritically marked Yoruba, likely stemming from tokenization behavior, insufficient representation of tonal cues, and limited tone modeling in the underlying pre-training. The study concludes that tone-aware approaches, spanning tokenization, acoustic-text alignment, and model objectives, are necessary to improve recognition for Yoruba and other low-resource tonal languages. The findings clarify the interaction between linguistic tone systems and computational modeling, and offer concrete directions for building more robust, tone-sensitive ASR systems.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="olusanya-2026-tone">
<titleInfo>
<title>Tone in Yoruba ASR: Evaluating the Impact of Tone Recognition on Transformer-Based ASR Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joy</namePart>
<namePart type="family">Olusanya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hansi</namePart>
<namePart type="family">Hettiarachchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alistair</namePart>
<namePart type="family">Plum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Gaber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damith</namePart>
<namePart type="family">Premasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fiona</namePart>
<namePart type="given">Anting</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lasitha</namePart>
<namePart type="family">Uyangodage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-377-7</identifier>
</relatedItem>
<abstract>This research investigates the role of tone in Standard Yoruba Automatic Speech Recognition (ASR), focusing on how explicit tone marking (diacritics) influences accuracy and overall system performance. As a low-resource tonal language, Yoruba encodes critical lexical and grammatical contrasts via pitch, making tone handling both essential and challenging for ASR. Three pre-trained models, Meta’s MMS-1B-all, OpenAI’s Whisper-small, and AstralZander/Yoruba_ASR, were trained and evaluated on datasets that vary by tone annotation (fully tone-marked vs. non-tone-marked). Using Word Error Rate (WER) and Tone Error Rate (TER) as primary metrics, results consistently favored non-tone-marked data, yielding substantially lower error rates than their tone-marked counterparts. These outcomes suggest that current architectures encounter difficulties with diacritically marked Yoruba, likely stemming from tokenization behavior, insufficient representation of tonal cues, and limited tone modeling in the underlying pre-training. The study concludes that tone-aware approaches, spanning tokenization, acoustic-text alignment, and model objectives, are necessary to improve recognition for Yoruba and other low-resource tonal languages. The findings clarify the interaction between linguistic tone systems and computational modeling, and offer concrete directions for building more robust, tone-sensitive ASR systems.</abstract>
<identifier type="citekey">olusanya-2026-tone</identifier>
<location>
<url>https://aclanthology.org/2026.loreslm-1.14/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>149</start>
<end>156</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tone in Yoruba ASR: Evaluating the Impact of Tone Recognition on Transformer-Based ASR Models
%A Olusanya, Joy
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F olusanya-2026-tone
%X This research investigates the role of tone in Standard Yoruba Automatic Speech Recognition (ASR), focusing on how explicit tone marking (diacritics) influences accuracy and overall system performance. As a low-resource tonal language, Yoruba encodes critical lexical and grammatical contrasts via pitch, making tone handling both essential and challenging for ASR. Three pre-trained models, Meta’s MMS-1B-all, OpenAI’s Whisper-small, and AstralZander/Yoruba_ASR, were trained and evaluated on datasets that vary by tone annotation (fully tone-marked vs. non-tone-marked). Using Word Error Rate (WER) and Tone Error Rate (TER) as primary metrics, results consistently favored non-tone-marked data, yielding substantially lower error rates than their tone-marked counterparts. These outcomes suggest that current architectures encounter difficulties with diacritically marked Yoruba, likely stemming from tokenization behavior, insufficient representation of tonal cues, and limited tone modeling in the underlying pre-training. The study concludes that tone-aware approaches, spanning tokenization, acoustic-text alignment, and model objectives, are necessary to improve recognition for Yoruba and other low-resource tonal languages. The findings clarify the interaction between linguistic tone systems and computational modeling, and offer concrete directions for building more robust, tone-sensitive ASR systems.
%U https://aclanthology.org/2026.loreslm-1.14/
%P 149-156
Markdown (Informal)
[Tone in Yoruba ASR: Evaluating the Impact of Tone Recognition on Transformer-Based ASR Models](https://aclanthology.org/2026.loreslm-1.14/) (Olusanya, LoResLM 2026)
ACL