@inproceedings{olufemi-etal-2025-challenging,
title = "Challenging Multimodal {LLM}s with {A}frican Standardized Exams: A Document {VQA} Evaluation",
author = "Olufemi, Victor Tolulope and
Babatunde, Oreoluwa Boluwatife and
Bolarinwa, Emmanuel and
Moshood, Kausar Yetunde",
editor = "Lignos, Constantine and
Abdulmumin, Idris and
Adelani, David",
booktitle = "Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.africanlp-1.22/",
doi = "10.18653/v1/2025.africanlp-1.22",
pages = "150--157",
ISBN = "979-8-89176-257-2",
abstract = "Despite rapid advancements in multimodal large language models (MLLMs), their ability to process low-resource African languages in document-based visual question answering (VQA) tasks remains limited. This paper evaluates three state-of-the-art MLLMs{---}GPT-4o, Claude-3.5 Haiku, and Gemini-1.5 Pro{---}on WAEC/NECO standardized exam questions in Yoruba, Igbo, and Hausa. We curate a dataset of multiple-choice questions from exam images and compare model accuracies across two prompting strategies: (1) using English prompts for African language questions, and (2) using native-language prompts. While GPT-4o achieves over 90{\%} accuracy for English, performance drops below 40{\%} for African languages, highlighting severe data imbalance in model training. Notably, native-language prompting improves accuracy for most models, yet no system approaches human-level performance, which reaches over 50{\%} in Yoruba, Igbo, and Hausa. These findings emphasize the need for diverse training data, fine-tuning, and dedicated benchmarks that address the linguistic intricacies of African languages in multimodal tasks, paving the way for more equitable and effective AI systems in education."
}