@inproceedings{okewunmi-etal-2025-evaluating,
    title = {Evaluating Robustness of {LLM}s to Typographical Noise in {Y}or{\`u}b{\'a} {QA}},
    author = {Okewunmi, Paul and
      James, Favour and
      Fajemila, Oluwadunsin},
    editor = {Lignos, Constantine and
      Abdulmumin, Idris and
      Adelani, David},
    booktitle = {Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)},
    month = jul,
    year = {2025},
    address = {Vienna, Austria},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2025.africanlp-1.29/},
    doi = {10.18653/v1/2025.africanlp-1.29},
    pages = {195--202},
    isbn = {979-8-89176-257-2},
    abstract = {Generative AI models are primarily accessed through chat interfaces, where user queries often contain typographical errors. While these models perform well in English, their robustness to noisy inputs in low-resource languages like Yor{\`u}b{\'a} remains underexplored. This work investigates a Yor{\`u}b{\'a} question-answering (QA) task by introducing synthetic typographical noise into clean inputs. We design a probabilistic noise injection strategy that simulates realistic human typos. In our experiments, each character in a clean sentence is independently altered, with noise levels ranging from 10{\%} to 40{\%}. We evaluate performance across three strong multilingual models using two complementary metrics: (1) a multilingual BERTScore to assess semantic similarity between outputs on clean and noisy inputs, and (2) an LLM-as-judge approach, where the best Yor{\`u}b{\'a}-capable model rates fluency, comprehension, and accuracy on a 1{--}5 scale. Results show that while English QA performance degrades gradually, Yor{\`u}b{\'a} QA suffers a sharper decline. At 40{\%} noise, GPT-4o experiences over a 50{\%} drop in comprehension ability, with similar declines for Gemini 2.0 Flash and Claude 3.7 Sonnet. We conclude with recommendations for noise-aware training and dedicated noisy Yor{\`u}b{\'a} benchmarks to enhance LLM robustness in low-resource settings.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="okewunmi-etal-2025-evaluating">
<titleInfo>
<title>Evaluating Robustness of LLMs to Typographical Noise in Yorùbá QA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Okewunmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Favour</namePart>
<namePart type="family">James</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oluwadunsin</namePart>
<namePart type="family">Fajemila</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Constantine</namePart>
<namePart type="family">Lignos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Idris</namePart>
<namePart type="family">Abdulmumin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Adelani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-257-2</identifier>
</relatedItem>
<abstract>Generative AI models are primarily accessed through chat interfaces, where user queries often contain typographical errors. While these models perform well in English, their robustness to noisy inputs in low-resource languages like Yorùbá remains underexplored. This work investigates a Yorùbá question-answering (QA) task by introducing synthetic typographical noise into clean inputs. We design a probabilistic noise injection strategy that simulates realistic human typos. In our experiments, each character in a clean sentence is independently altered, with noise levels ranging from 10% to 40%. We evaluate performance across three strong multilingual models using two complementary metrics: (1) a multilingual BERTScore to assess semantic similarity between outputs on clean and noisy inputs, and (2) an LLM-as-judge approach, where the best Yorùbá-capable model rates fluency, comprehension, and accuracy on a 1–5 scale. Results show that while English QA performance degrades gradually, Yorùbá QA suffers a sharper decline. At 40% noise, GPT-4o experiences over a 50% drop in comprehension ability, with similar declines for Gemini 2.0 Flash and Claude 3.7 Sonnet. We conclude with recommendations for noise-aware training and dedicated noisy Yorùbá benchmarks to enhance LLM robustness in low-resource settings.</abstract>
<identifier type="citekey">okewunmi-etal-2025-evaluating</identifier>
<identifier type="doi">10.18653/v1/2025.africanlp-1.29</identifier>
<location>
<url>https://aclanthology.org/2025.africanlp-1.29/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>195</start>
<end>202</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Robustness of LLMs to Typographical Noise in Yorùbá QA
%A Okewunmi, Paul
%A James, Favour
%A Fajemila, Oluwadunsin
%Y Lignos, Constantine
%Y Abdulmumin, Idris
%Y Adelani, David
%S Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-257-2
%F okewunmi-etal-2025-evaluating
%X Generative AI models are primarily accessed through chat interfaces, where user queries often contain typographical errors. While these models perform well in English, their robustness to noisy inputs in low-resource languages like Yorùbá remains underexplored. This work investigates a Yorùbá question-answering (QA) task by introducing synthetic typographical noise into clean inputs. We design a probabilistic noise injection strategy that simulates realistic human typos. In our experiments, each character in a clean sentence is independently altered, with noise levels ranging from 10% to 40%. We evaluate performance across three strong multilingual models using two complementary metrics: (1) a multilingual BERTScore to assess semantic similarity between outputs on clean and noisy inputs, and (2) an LLM-as-judge approach, where the best Yorùbá-capable model rates fluency, comprehension, and accuracy on a 1–5 scale. Results show that while English QA performance degrades gradually, Yorùbá QA suffers a sharper decline. At 40% noise, GPT-4o experiences over a 50% drop in comprehension ability, with similar declines for Gemini 2.0 Flash and Claude 3.7 Sonnet. We conclude with recommendations for noise-aware training and dedicated noisy Yorùbá benchmarks to enhance LLM robustness in low-resource settings.
%R 10.18653/v1/2025.africanlp-1.29
%U https://aclanthology.org/2025.africanlp-1.29/
%U https://doi.org/10.18653/v1/2025.africanlp-1.29
%P 195-202
Markdown (Informal)
[Evaluating Robustness of LLMs to Typographical Noise in Yorùbá QA](https://aclanthology.org/2025.africanlp-1.29/) (Okewunmi et al., AfricaNLP 2025)
ACL