@inproceedings{kazi-etal-2025-crossing,
title = "Crossing Language Boundaries: Evaluation of Large Language Models on {U}rdu-{E}nglish Question Answering",
author = "Kazi, Samreen and
Rahim, Maria and
Khoja, Shakeel Ahmed",
editor = "Weerasinghe, Ruvan and
Anuradha, Isuri and
Sumanathilaka, Deshan",
booktitle = "Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages",
month = jan,
year = "2025",
address = "Abu Dhabi",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.indonlp-1.17/",
pages = "141--151",
abstract = "This study evaluates the question-answering capabilities of Large Language Models (LLMs) in Urdu, addressing a critical gap in low-resource language processing. Four models---GPT-4, mBERT, XLM-R, and mT5---are assessed across monolingual, cross-lingual, and mixed-language settings using the UQuAD1.0 and SQuAD2.0 datasets. Results reveal significant performance gaps between English and Urdu processing, with GPT-4 achieving the highest F1 scores (89.1{\%} in English, 76.4{\%} in Urdu) while demonstrating relative robustness in cross-lingual scenarios. Boundary detection and translation mismatches emerge as primary challenges, particularly in cross-lingual settings. The study further demonstrates that question complexity and length significantly impact performance, with factoid questions yielding 14.2{\%} higher F1 scores compared to complex questions. These findings establish important benchmarks for enhancing LLM performance in low-resource languages and identify key areas for improvement in multilingual question-answering systems."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kazi-etal-2025-crossing">
<titleInfo>
<title>Crossing Language Boundaries: Evaluation of Large Language Models on Urdu-English Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samreen</namePart>
<namePart type="family">Kazi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Rahim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shakeel</namePart>
<namePart type="given">Ahmed</namePart>
<namePart type="family">Khoja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruvan</namePart>
<namePart type="family">Weerasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isuri</namePart>
<namePart type="family">Anuradha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deshan</namePart>
<namePart type="family">Sumanathilaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study evaluates the question-answering capabilities of Large Language Models (LLMs) in Urdu, addressing a critical gap in low-resource language processing. Four models—GPT-4, mBERT, XLM-R, and mT5—are assessed across monolingual, cross-lingual, and mixed-language settings using the UQuAD1.0 and SQuAD2.0 datasets. Results reveal significant performance gaps between English and Urdu processing, with GPT-4 achieving the highest F1 scores (89.1% in English, 76.4% in Urdu) while demonstrating relative robustness in cross-lingual scenarios. Boundary detection and translation mismatches emerge as primary challenges, particularly in cross-lingual settings. The study further demonstrates that question complexity and length significantly impact performance, with factoid questions yielding 14.2% higher F1 scores compared to complex questions. These findings establish important benchmarks for enhancing LLM performance in low-resource languages and identify key areas for improvement in multilingual question-answering systems.</abstract>
<identifier type="citekey">kazi-etal-2025-crossing</identifier>
<location>
<url>https://aclanthology.org/2025.indonlp-1.17/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>141</start>
<end>151</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Crossing Language Boundaries: Evaluation of Large Language Models on Urdu-English Question Answering
%A Kazi, Samreen
%A Rahim, Maria
%A Khoja, Shakeel Ahmed
%Y Weerasinghe, Ruvan
%Y Anuradha, Isuri
%Y Sumanathilaka, Deshan
%S Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi
%F kazi-etal-2025-crossing
%X This study evaluates the question-answering capabilities of Large Language Models (LLMs) in Urdu, addressing a critical gap in low-resource language processing. Four models—GPT-4, mBERT, XLM-R, and mT5—are assessed across monolingual, cross-lingual, and mixed-language settings using the UQuAD1.0 and SQuAD2.0 datasets. Results reveal significant performance gaps between English and Urdu processing, with GPT-4 achieving the highest F1 scores (89.1% in English, 76.4% in Urdu) while demonstrating relative robustness in cross-lingual scenarios. Boundary detection and translation mismatches emerge as primary challenges, particularly in cross-lingual settings. The study further demonstrates that question complexity and length significantly impact performance, with factoid questions yielding 14.2% higher F1 scores compared to complex questions. These findings establish important benchmarks for enhancing LLM performance in low-resource languages and identify key areas for improvement in multilingual question-answering systems.
%U https://aclanthology.org/2025.indonlp-1.17/
%P 141-151
Markdown (Informal)
[Crossing Language Boundaries: Evaluation of Large Language Models on Urdu-English Question Answering](https://aclanthology.org/2025.indonlp-1.17/) (Kazi et al., IndoNLP 2025)
ACL