@inproceedings{faheem-etal-2026-u,
title = "{U}-{MIRAGE}: Benchmarking Chain-of-Thought Reasoning for {U}rdu Medical {QA}",
author = "Faheem, Ali and
Ullah, Faizad and
Hammad, Muhammad and
Hassan, Ahmed and
Ayub, Muhammad Sohaib and
Karim, Asim",
booktitle = "Proceedings of the 2nd Workshop on {NLP} for Languages Using {A}rabic Script",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.abjadnlp-1.55/",
pages = "453--460",
    abstract = "Medical AI systems increasingly rely on large language models (LLMs), yet their deployment in linguistically diverse regions remains unexplored. We address this gap by introducing U-MIRAGE, the first medical question-answering benchmark for Urdu and Roman Urdu. Urdu is the 11th most spoken language (with over 246 million speakers) worldwide. Our systematic evaluation of six state-of-the-art LLMs reveals three main findings. (1) 6{\%} to 10{\%} drop in performance when moving from English to Urdu variants, even though medical knowledge should theoretically transfer across languages. (2) Chain-of-Thought (CoT) prompting improves small models by 8{\%} to 20{\%}, while surprisingly the larger models' performance degraded by up to 3{\%}. (3) Quantized small models fail catastrophically in low-resource languages, achieving near-random accuracy regardless of various prompting strategies. These findings challenge core assumptions about multilingual medical AI systems. Roman Urdu consistently outperforms standard Urdu script, suggesting orthographic alignment with pre-training data matters more than linguistic proximity. CoT prompting effectiveness depends critically on model architecture rather than task complexity alone. Our contributions are threefold: (1) U-MIRAGE, (2) systematic benchmarking of LLMs for Urdu and Roman Urdu medical reasoning, and (3) empirical analysis of CoT prompting in low-resource contexts. Our code and datasets are publicly available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="faheem-etal-2026-u">
<titleInfo>
<title>U-MIRAGE: Benchmarking Chain-of-Thought Reasoning for Urdu Medical QA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Faheem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Faizad</namePart>
<namePart type="family">Ullah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="family">Hammad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Hassan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="given">Sohaib</namePart>
<namePart type="family">Ayub</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asim</namePart>
<namePart type="family">Karim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
  <abstract>Medical AI systems increasingly rely on large language models (LLMs), yet their deployment in linguistically diverse regions remains unexplored. We address this gap by introducing U-MIRAGE, the first medical question-answering benchmark for Urdu and Roman Urdu. Urdu is the 11th most spoken language (with over 246 million speakers) worldwide. Our systematic evaluation of six state-of-the-art LLMs reveals three main findings. (1) 6% to 10% drop in performance when moving from English to Urdu variants, even though medical knowledge should theoretically transfer across languages. (2) Chain-of-Thought (CoT) prompting improves small models by 8% to 20%, while surprisingly the larger models’ performance degraded by up to 3%. (3) Quantized small models fail catastrophically in low-resource languages, achieving near-random accuracy regardless of various prompting strategies. These findings challenge core assumptions about multilingual medical AI systems. Roman Urdu consistently outperforms standard Urdu script, suggesting orthographic alignment with pre-training data matters more than linguistic proximity. CoT prompting effectiveness depends critically on model architecture rather than task complexity alone. Our contributions are threefold: (1) U-MIRAGE, (2) systematic benchmarking of LLMs for Urdu and Roman Urdu medical reasoning, and (3) empirical analysis of CoT prompting in low-resource contexts. Our code and datasets are publicly available.</abstract>
<identifier type="citekey">faheem-etal-2026-u</identifier>
<location>
<url>https://aclanthology.org/2026.abjadnlp-1.55/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>453</start>
<end>460</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T U-MIRAGE: Benchmarking Chain-of-Thought Reasoning for Urdu Medical QA
%A Faheem, Ali
%A Ullah, Faizad
%A Hammad, Muhammad
%A Hassan, Ahmed
%A Ayub, Muhammad Sohaib
%A Karim, Asim
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F faheem-etal-2026-u
%X Medical AI systems increasingly rely on large language models (LLMs), yet their deployment in linguistically diverse regions remains unexplored. We address this gap by introducing U-MIRAGE, the first medical question-answering benchmark for Urdu and Roman Urdu. Urdu is the 11th most spoken language (with over 246 million speakers) worldwide. Our systematic evaluation of six state-of-the-art LLMs reveals three main findings. (1) 6% to 10% drop in performance when moving from English to Urdu variants, even though medical knowledge should theoretically transfer across languages. (2) Chain-of-Thought (CoT) prompting improves small models by 8% to 20%, while surprisingly the larger models’ performance degraded by up to 3%. (3) Quantized small models fail catastrophically in low-resource languages, achieving near-random accuracy regardless of various prompting strategies. These findings challenge core assumptions about multilingual medical AI systems. Roman Urdu consistently outperforms standard Urdu script, suggesting orthographic alignment with pre-training data matters more than linguistic proximity. CoT prompting effectiveness depends critically on model architecture rather than task complexity alone. Our contributions are threefold: (1) U-MIRAGE, (2) systematic benchmarking of LLMs for Urdu and Roman Urdu medical reasoning, and (3) empirical analysis of CoT prompting in low-resource contexts. Our code and datasets are publicly available.
%U https://aclanthology.org/2026.abjadnlp-1.55/
%P 453-460
Markdown (Informal)
[U-MIRAGE: Benchmarking Chain-of-Thought Reasoning for Urdu Medical QA](https://aclanthology.org/2026.abjadnlp-1.55/) (Faheem et al., AbjadNLP 2026)
ACL