@inproceedings{khongthaw-etal-2025-dods,
title = "{D}o{DS}-{IITPKD}:Submissions to the {WMT}25 Low-Resource {I}ndic Language Translation Task",
author = "Khongthaw, Ontiwell and
Salvin, G.l. and
Budde, Shrikant and
Chigwededza, Abigairl and
Malkar, Dhruvadeep and
Hingmire, Swapnil",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Tenth Conference on Machine Translation",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wmt-1.102/",
pages = "1248--1252",
ISBN = "979-8-89176-341-8",
abstract = "Low-resource translation for Indic languages poses significant challenges due to limited parallel corpora and linguistic diversity. In this work, we describe our participation in the WMT 2025 shared task for four Indic languages-Khasi, Mizo, Assamese, which is categorized into Category 1 and Bodo in Cate- gory 2. For our PRIMARY submission, we fine- tuned the distilled NLLB-200 model on bidi- rectional English{\ensuremath{\leftrightarrow}}Khasi and English{\ensuremath{\leftrightarrow}}Mizo data, and employed the IndicTrans2 model family for Assamese and Bodo translation. Our CONTRASTIVE submission augments training with external corpora from PMIN- DIA and Google SMOL to further enrich low- resource data coverage. Both systems lever- age Low-Rank Adaptation (LoRA) within a parameter-efficient fine-tuning framework, en- abling lightweight adapter training atop frozen pretrained weights. The translation pipeline was developed using the Hugging Face Trans- formers and PEFT libraries, augmented with bespoke preprocessing modules that append both language and domain identifiers to each instance. We evaluated our approach on par- allel corpora spanning multiple domains- ar- ticle based, newswire, scientific, and biblical texts as provided by the WMT25 dataset, under conditions of severe data scarcity. Fine-tuning lightweight LoRA adapters on targeted parallel corpora yields marked improvements in evalua- tion metrics, confirming their effectiveness for cross-domain adaptation in low-resource Indic languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khongthaw-etal-2025-dods">
<titleInfo>
<title>DoDS-IITPKD:Submissions to the WMT25 Low-Resource Indic Language Translation Task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ontiwell</namePart>
<namePart type="family">Khongthaw</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">G.l.</namePart>
<namePart type="family">Salvin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shrikant</namePart>
<namePart type="family">Budde</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abigairl</namePart>
<namePart type="family">Chigwededza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhruvadeep</namePart>
<namePart type="family">Malkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Swapnil</namePart>
<namePart type="family">Hingmire</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-341-8</identifier>
</relatedItem>
<abstract>Low-resource translation for Indic languages poses significant challenges due to limited parallel corpora and linguistic diversity. In this work, we describe our participation in the WMT 2025 shared task for four Indic languages: Khasi, Mizo, and Assamese in Category 1, and Bodo in Category 2. For our PRIMARY submission, we fine-tuned the distilled NLLB-200 model on bidirectional English↔Khasi and English↔Mizo data, and employed the IndicTrans2 model family for Assamese and Bodo translation. Our CONTRASTIVE submission augments training with external corpora from PMINDIA and Google SMOL to further enrich low-resource data coverage. Both systems leverage Low-Rank Adaptation (LoRA) within a parameter-efficient fine-tuning framework, enabling lightweight adapter training atop frozen pretrained weights. The translation pipeline was developed using the Hugging Face Transformers and PEFT libraries, augmented with bespoke preprocessing modules that append both language and domain identifiers to each instance. We evaluated our approach on parallel corpora spanning multiple domains (article-based, newswire, scientific, and biblical texts) provided by the WMT25 dataset, under conditions of severe data scarcity. Fine-tuning lightweight LoRA adapters on targeted parallel corpora yields marked improvements in evaluation metrics, confirming their effectiveness for cross-domain adaptation in low-resource Indic languages.</abstract>
<identifier type="citekey">khongthaw-etal-2025-dods</identifier>
<location>
<url>https://aclanthology.org/2025.wmt-1.102/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1248</start>
<end>1252</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DoDS-IITPKD:Submissions to the WMT25 Low-Resource Indic Language Translation Task
%A Khongthaw, Ontiwell
%A Salvin, G.l.
%A Budde, Shrikant
%A Chigwededza, Abigairl
%A Malkar, Dhruvadeep
%A Hingmire, Swapnil
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Tenth Conference on Machine Translation
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-341-8
%F khongthaw-etal-2025-dods
%X Low-resource translation for Indic languages poses significant challenges due to limited parallel corpora and linguistic diversity. In this work, we describe our participation in the WMT 2025 shared task for four Indic languages: Khasi, Mizo, and Assamese in Category 1, and Bodo in Category 2. For our PRIMARY submission, we fine-tuned the distilled NLLB-200 model on bidirectional English↔Khasi and English↔Mizo data, and employed the IndicTrans2 model family for Assamese and Bodo translation. Our CONTRASTIVE submission augments training with external corpora from PMINDIA and Google SMOL to further enrich low-resource data coverage. Both systems leverage Low-Rank Adaptation (LoRA) within a parameter-efficient fine-tuning framework, enabling lightweight adapter training atop frozen pretrained weights. The translation pipeline was developed using the Hugging Face Transformers and PEFT libraries, augmented with bespoke preprocessing modules that append both language and domain identifiers to each instance. We evaluated our approach on parallel corpora spanning multiple domains (article-based, newswire, scientific, and biblical texts) provided by the WMT25 dataset, under conditions of severe data scarcity. Fine-tuning lightweight LoRA adapters on targeted parallel corpora yields marked improvements in evaluation metrics, confirming their effectiveness for cross-domain adaptation in low-resource Indic languages.
%U https://aclanthology.org/2025.wmt-1.102/
%P 1248-1252
Markdown (Informal)
[DoDS-IITPKD:Submissions to the WMT25 Low-Resource Indic Language Translation Task](https://aclanthology.org/2025.wmt-1.102/) (Khongthaw et al., WMT 2025)
ACL
Ontiwell Khongthaw, G.l. Salvin, Shrikant Budde, Abigairl Chigwededza, Dhruvadeep Malkar, and Swapnil Hingmire. 2025. DoDS-IITPKD:Submissions to the WMT25 Low-Resource Indic Language Translation Task. In Proceedings of the Tenth Conference on Machine Translation, pages 1248–1252, Suzhou, China. Association for Computational Linguistics.
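Below is a minimal Python sketch of the kind of pipeline the abstract describes: LoRA adapters (via the PEFT library) trained on top of a frozen distilled NLLB-200 checkpoint from Hugging Face Transformers, with language and domain identifiers attached to every training instance. The checkpoint name, LoRA hyperparameters, language codes, and tag format below are illustrative assumptions, not the authors' exact configuration.

```python
# Minimal sketch (not the authors' released code): LoRA fine-tuning of a
# distilled NLLB-200 model with Hugging Face Transformers + PEFT, following
# the setup outlined in the abstract. Checkpoint name, LoRA hyperparameters,
# language codes, and the identifier format are illustrative assumptions.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import LoraConfig, TaskType, get_peft_model

MODEL_NAME = "facebook/nllb-200-distilled-600M"  # assumed distilled checkpoint

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Lightweight LoRA adapters trained atop frozen pretrained weights.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,                                 # assumed adapter rank
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # attention projections in NLLB-200
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()        # only adapter weights are trainable


def preprocess(example, lang_tag="eng2lus", domain_tag="news"):
    """Attach language and domain identifiers to an instance and tokenize it.

    The exact identifier format is an assumption; the abstract only states
    that both a language and a domain identifier are added to each instance.
    """
    source = f"<{lang_tag}> <{domain_tag}> {example['source']}"
    return tokenizer(source, text_target=example["target"],
                     truncation=True, max_length=256)
```

The resulting `model` can then be handed to a standard `Seq2SeqTrainer`; only the LoRA adapter parameters receive gradients, which keeps the fine-tuning footprint small under severe data scarcity.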