% Citation record for the QARI paper: pages 168--177 of the LoResLM 2026 workshop proceedings.
% Case-sensitive words in title/booktitle are brace-protected as whole words.
% NOTE(review): address holds "Rabat, Morocco" -- presumably the workshop venue rather than
% the publisher's city; confirm before using a style that prints it as the publisher location.
% NOTE(review): the abstract contains raw UTF-8 (Urdu script); presumably processed with a
% UTF-8-aware toolchain -- verify if classic 8-bit BibTeX is used.
@inproceedings{kazi-khoja-2026-qari,
  title     = {{QARI}: Neural Architecture for {Urdu} Extractive Machine Reading Comprehension},
  author    = {Kazi, Samreen and
               Khoja, Shakeel Ahmed},
  editor    = {Hettiarachchi, Hansi and
               Ranasinghe, Tharindu and
               Plum, Alistair and
               Rayson, Paul and
               Mitkov, Ruslan and
               Gaber, Mohamed and
               Premasiri, Damith and
               Tan, Fiona Anting and
               Uyangodage, Lasitha},
  booktitle = {Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({LoResLM} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.loreslm-1.16/},
  pages     = {168--177},
  isbn      = {979-8-89176-377-7},
  abstract  = {Urdu, a morphologically rich and low-resource language spoken by over 300 million people, poses unique challenges for extractive machine reading comprehension (EMRC), particularly in accurately identifying span boundaries involving postpositions and copulas. Existing multilingual models struggle with subword fragmentation and imprecise span extraction in such settings. We introduce QARI (قاری, ``reader''), a character-enhanced architecture for Urdu extractive MRC that augments pretrained multilingual encoders with three innovations: (1) a character-level CNN that captures affix patterns and morphological features from full word forms; (2) a gated fusion mechanism that integrates semantic and morphological representations; and (3) a boundary-contrastive learning objective targeting Urdu-specific span errors. Evaluated on UQuAD+, the first native Urdu MRC benchmark, QARI achieves 83.5 F1, a 5.5 point improvement over the previous best result (mT5, 78.0 F1), setting a new state of the art. Ablations show that character-level modeling and boundary supervision contribute +7.5 and +7.0 F1, respectively. Cross-dataset evaluations on UQA and UrFQuAD confirm QARI{'}s robustness. Error analysis reveals significant reductions in boundary drift, with improvements most notable for short factual questions.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="kazi-khoja-2026-qari">
    <titleInfo>
      <title>QARI: Neural Architecture for Urdu Extractive Machine Reading Comprehension</title>
    </titleInfo>
    <!-- Paper authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Samreen</namePart>
      <namePart type="family">Kazi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shakeel</namePart>
      <namePart type="given">Ahmed</namePart>
      <namePart type="family">Khoja</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Hansi</namePart>
        <namePart type="family">Hettiarachchi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tharindu</namePart>
        <namePart type="family">Ranasinghe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alistair</namePart>
        <namePart type="family">Plum</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Paul</namePart>
        <namePart type="family">Rayson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ruslan</namePart>
        <namePart type="family">Mitkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohamed</namePart>
        <namePart type="family">Gaber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Damith</namePart>
        <namePart type="family">Premasiri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fiona</namePart>
        <namePart type="given">Anting</namePart>
        <namePart type="family">Tan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lasitha</namePart>
        <namePart type="family">Uyangodage</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-377-7</identifier>
    </relatedItem>
    <abstract>Urdu, a morphologically rich and low-resource language spoken by over 300 million people, poses unique challenges for extractive machine reading comprehension (EMRC), particularly in accurately identifying span boundaries involving postpositions and copulas. Existing multilingual models struggle with subword fragmentation and imprecise span extraction in such settings. We introduce QARI (قاری, “reader”), a character-enhanced architecture for Urdu extractive MRC that augments pretrained multilingual encoders with three innovations: (1) a character-level CNN that captures affix patterns and morphological features from full word forms; (2) a gated fusion mechanism that integrates semantic and morphological representations; and (3) a boundary-contrastive learning objective targeting Urdu-specific span errors. Evaluated on UQuAD+, the first native Urdu MRC benchmark, QARI achieves 83.5 F1, a 5.5 point improvement over the previous best result (mT5, 78.0 F1), setting a new state of the art. Ablations show that character-level modeling and boundary supervision contribute +7.5 and +7.0 F1, respectively. Cross-dataset evaluations on UQA and UrFQuAD confirm QARI’s robustness. Error analysis reveals significant reductions in boundary drift, with improvements most notable for short factual questions.</abstract>
    <identifier type="citekey">kazi-khoja-2026-qari</identifier>
    <location>
      <url>https://aclanthology.org/2026.loreslm-1.16/</url>
    </location>
    <!-- Position of the paper inside the host item: date and page extent. -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>168</start>
        <end>177</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T QARI: Neural Architecture for Urdu Extractive Machine Reading Comprehension
%A Kazi, Samreen
%A Khoja, Shakeel Ahmed
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F kazi-khoja-2026-qari
%X Urdu, a morphologically rich and low-resource language spoken by over 300 million people, poses unique challenges for extractive machine reading comprehension (EMRC), particularly in accurately identifying span boundaries involving postpositions and copulas. Existing multilingual models struggle with subword fragmentation and imprecise span extraction in such settings. We introduce QARI (قاری, “reader”), a character-enhanced architecture for Urdu extractive MRC that augments pretrained multilingual encoders with three innovations: (1) a character-level CNN that captures affix patterns and morphological features from full word forms; (2) a gated fusion mechanism that integrates semantic and morphological representations; and (3) a boundary-contrastive learning objective targeting Urdu-specific span errors. Evaluated on UQuAD+, the first native Urdu MRC benchmark, QARI achieves 83.5 F1, a 5.5 point improvement over the previous best result (mT5, 78.0 F1), setting a new state of the art. Ablations show that character-level modeling and boundary supervision contribute +7.5 and +7.0 F1, respectively. Cross-dataset evaluations on UQA and UrFQuAD confirm QARI’s robustness. Error analysis reveals significant reductions in boundary drift, with improvements most notable for short factual questions.
%U https://aclanthology.org/2026.loreslm-1.16/
%P 168-177
Markdown (Informal)
[QARI: Neural Architecture for Urdu Extractive Machine Reading Comprehension](https://aclanthology.org/2026.loreslm-1.16/) (Kazi & Khoja, LoResLM 2026)
ACL