@inproceedings{umar-etal-2026-anchoring,
title = "Anchoring the Judge: Curriculum-Based Adaptation and Reference-Anchored {MQM} for {LLM}-Based Machine Translation of an Unseen Low-Resource Language - A Case of Nupe",
author = "Umar, Umar Baba and
Bashir, Sulaimon Adebayo and
Mohammed, Abdulmalik Danlami",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Plum, Alistair and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({L}o{R}es{LM} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.loreslm-1.19/",
pages = "200--211",
ISBN = "979-8-89176-377-7",
abstract = "Adapting large language models (LLMs) for machine translation has shown strong performance in low-resource languages; however, their effectiveness for \textit{unseen, extremely low-resource languages} remains largely unexplored. We present \textbf{NupeMT-QLoRA}, a curriculum-based adaptation framework for the Nupe{--}English language pair. Our approach employs a two-stage QLoRA fine-tuning strategy: (i) initial training on 34k noisy parallel sentence pairs, followed by (ii) continued fine-tuning on a smaller, cleaner set of 12k bidirectional parallel sentences with explicit translation-direction tags. This staged curriculum stabilizes optimization and improves robustness under severe data scarcity.We further identify a reliability crisis in existing automatic evaluation metrics for unseen languages. Popular LLM-based judges such as GEMBA and xCOMET exhibit weak correlation with human judgments (Kendall{'}s $\tau \approx 0.21$) and low inter-rater reliability (Fleiss' $\kappa \approx 0.27$), largely due to fluency bias. To address this, we propose \textbf{Ref-Anchor-MQM}, a reference-anchored evaluation protocol that forces the judge to extract Key Semantic Units from a human reference before scoring.Experimental results show that NupeMT-QLoRA substantially outperforms NLLB-200, improving chrF++ from 22.73 to 41.10, while Ref-Anchor-MQM achieves significantly higher alignment with human evaluation ($\tau = 0.71$). Our framework provides a scalable pipeline for adapting and evaluating LLMs on languages with zero prior representation."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="umar-etal-2026-anchoring">
<titleInfo>
<title>Anchoring the Judge: Curriculum-Based Adaptation and Reference-Anchored MQM for LLM-Based Machine Translation of an Unseen Low-Resource Language - A Case of Nupe</title>
</titleInfo>
<name type="personal">
<namePart type="given">Umar</namePart>
<namePart type="given">Baba</namePart>
<namePart type="family">Umar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sulaimon</namePart>
<namePart type="given">Adebayo</namePart>
<namePart type="family">Bashir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdulmalik</namePart>
<namePart type="given">Danlami</namePart>
<namePart type="family">Mohammed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hansi</namePart>
<namePart type="family">Hettiarachchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alistair</namePart>
<namePart type="family">Plum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Gaber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damith</namePart>
<namePart type="family">Premasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fiona</namePart>
<namePart type="given">Anting</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lasitha</namePart>
<namePart type="family">Uyangodage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-377-7</identifier>
</relatedItem>
<abstract>Adapting large language models (LLMs) for machine translation has shown strong performance in low-resource languages; however, their effectiveness for unseen, extremely low-resource languages remains largely unexplored. We present NupeMT-QLoRA, a curriculum-based adaptation framework for the Nupe–English language pair. Our approach employs a two-stage QLoRA fine-tuning strategy: (i) initial training on 34k noisy parallel sentence pairs, followed by (ii) continued fine-tuning on a smaller, cleaner set of 12k bidirectional parallel sentences with explicit translation-direction tags. This staged curriculum stabilizes optimization and improves robustness under severe data scarcity.We further identify a reliability crisis in existing automatic evaluation metrics for unseen languages. Popular LLM-based judges such as GEMBA and xCOMET exhibit weak correlation with human judgments (Kendall’s τ \approx 0.21) and low inter-rater reliability (Fleiss’ ąppa \approx 0.27), largely due to fluency bias. To address this, we propose Ref-Anchor-MQM, a reference-anchored evaluation protocol that forces the judge to extract Key Semantic Units from a human reference before scoring.Experimental results show that NupeMT-QLoRA substantially outperforms NLLB-200, improving chrF++ from 22.73 to 41.10, while Ref-Anchor-MQM achieves significantly higher alignment with human evaluation (τ = 0.71). Our framework provides a scalable pipeline for adapting and evaluating LLMs on languages with zero prior representation.</abstract>
<identifier type="citekey">umar-etal-2026-anchoring</identifier>
<location>
<url>https://aclanthology.org/2026.loreslm-1.19/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>200</start>
<end>211</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Anchoring the Judge: Curriculum-Based Adaptation and Reference-Anchored MQM for LLM-Based Machine Translation of an Unseen Low-Resource Language - A Case of Nupe
%A Umar, Umar Baba
%A Bashir, Sulaimon Adebayo
%A Mohammed, Abdulmalik Danlami
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F umar-etal-2026-anchoring
%X Adapting large language models (LLMs) for machine translation has shown strong performance in low-resource languages; however, their effectiveness for unseen, extremely low-resource languages remains largely unexplored. We present NupeMT-QLoRA, a curriculum-based adaptation framework for the Nupe–English language pair. Our approach employs a two-stage QLoRA fine-tuning strategy: (i) initial training on 34k noisy parallel sentence pairs, followed by (ii) continued fine-tuning on a smaller, cleaner set of 12k bidirectional parallel sentences with explicit translation-direction tags. This staged curriculum stabilizes optimization and improves robustness under severe data scarcity.We further identify a reliability crisis in existing automatic evaluation metrics for unseen languages. Popular LLM-based judges such as GEMBA and xCOMET exhibit weak correlation with human judgments (Kendall’s τ \approx 0.21) and low inter-rater reliability (Fleiss’ ąppa \approx 0.27), largely due to fluency bias. To address this, we propose Ref-Anchor-MQM, a reference-anchored evaluation protocol that forces the judge to extract Key Semantic Units from a human reference before scoring.Experimental results show that NupeMT-QLoRA substantially outperforms NLLB-200, improving chrF++ from 22.73 to 41.10, while Ref-Anchor-MQM achieves significantly higher alignment with human evaluation (τ = 0.71). Our framework provides a scalable pipeline for adapting and evaluating LLMs on languages with zero prior representation.
%U https://aclanthology.org/2026.loreslm-1.19/
%P 200-211
Markdown (Informal)
[Anchoring the Judge: Curriculum-Based Adaptation and Reference-Anchored MQM for LLM-Based Machine Translation of an Unseen Low-Resource Language - A Case of Nupe](https://aclanthology.org/2026.loreslm-1.19/) (Umar et al., LoResLM 2026)
ACL