@inproceedings{khamis-2026-navigating,
title = "Navigating Data Scarcity in Low-Resource {E}nglish-{T}atar Translation using {LLM} Fine-Tuning",
author = "Khamis, Ahmed Khaled",
editor = "Ojha, Atul Kr. and
Liu, Chao-hong and
Vylomova, Ekaterina and
Pirinen, Flammie and
Washington, Jonathan and
Oco, Nathaniel and
Zhao, Xiaobing",
booktitle = "Proceedings for the Ninth Workshop on Technologies for Machine Translation of Low Resource Languages ({L}o{R}es{MT} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.loresmt-1.16/",
pages = "198--202",
ISBN = "979-8-89176-366-1",
abstract = "The scarcity of high-quality parallel corpora remains the primary bottleneck for English-Tatar machine translation. While the OPUS project provides various datasets, our tests reveal that datasets like WikiMatrix, GNOME, and NLLB, suffer from significant noise and incorrect labeling, making them unsuitable for training robust encoder-decoder translation models that typically requires larger amount of high quality data. Furthermore, we demonstrate that small-scale multilingual Large Language Models (LLMs), such as Qwen3 (4B-30B), Gemma3 (4B-12B) and others, show severe ``Turkish interference'', and they frequently hallucinate Turkish vocabulary when prompted for Tatar.In this paper, we navigate this data scarcity by leveraging Llama 3.3 70B Instruct, which is the only model in our zero-shot benchmarks capable of maintaining distinct linguistic boundaries for Tatar. To address the lack of gold-standard data, we curated a synthetic dataset of 7,995 high-quality translation pairs using a frontier model as a teacher. We then performed 4-bit LoRA fine-tuning to train Llama for English-Tatar translation. Our results show a performance leap: while fine-tuning on the limited Tatoeba dataset (1,193 samples) yielded a CHRF++ score of 24.38, while fine-tuning on our synthetic dataset achieved 32.02 on the LoResMT 2026 shared task test set. We release our curated dataset and fine-tuned models to support further research in low-resource Turkic machine translation."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khamis-2026-navigating">
<titleInfo>
<title>Navigating Data Scarcity in Low-Resource English-Tatar Translation using LLM Fine-Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="given">Khaled</namePart>
<namePart type="family">Khamis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings for the Ninth Workshop on Technologies for Machine Translation of Low Resource Languages (LoResMT 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao-hong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flammie</namePart>
<namePart type="family">Pirinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Washington</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nathaniel</namePart>
<namePart type="family">Oco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaobing</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-366-1</identifier>
</relatedItem>
<abstract>The scarcity of high-quality parallel corpora remains the primary bottleneck for English-Tatar machine translation. While the OPUS project provides various datasets, our tests reveal that datasets such as WikiMatrix, GNOME, and NLLB suffer from significant noise and incorrect labeling, making them unsuitable for training robust encoder-decoder translation models, which typically require large amounts of high-quality data. Furthermore, we demonstrate that small-scale multilingual Large Language Models (LLMs), such as Qwen3 (4B-30B), Gemma3 (4B-12B), and others, show severe “Turkish interference”: they frequently hallucinate Turkish vocabulary when prompted for Tatar. In this paper, we navigate this data scarcity by leveraging Llama 3.3 70B Instruct, the only model in our zero-shot benchmarks capable of maintaining distinct linguistic boundaries for Tatar. To address the lack of gold-standard data, we curated a synthetic dataset of 7,995 high-quality translation pairs using a frontier model as a teacher. We then performed 4-bit LoRA fine-tuning to train Llama for English-Tatar translation. Our results show a clear performance leap: fine-tuning on the limited Tatoeba dataset (1,193 samples) yielded a CHRF++ score of 24.38, while fine-tuning on our synthetic dataset achieved 32.02 on the LoResMT 2026 shared task test set. We release our curated dataset and fine-tuned models to support further research in low-resource Turkic machine translation.</abstract>
<identifier type="citekey">khamis-2026-navigating</identifier>
<location>
<url>https://aclanthology.org/2026.loresmt-1.16/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>198</start>
<end>202</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Navigating Data Scarcity in Low-Resource English-Tatar Translation using LLM Fine-Tuning
%A Khamis, Ahmed Khaled
%Y Ojha, Atul Kr.
%Y Liu, Chao-hong
%Y Vylomova, Ekaterina
%Y Pirinen, Flammie
%Y Washington, Jonathan
%Y Oco, Nathaniel
%Y Zhao, Xiaobing
%S Proceedings for the Ninth Workshop on Technologies for Machine Translation of Low Resource Languages (LoResMT 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-366-1
%F khamis-2026-navigating
%X The scarcity of high-quality parallel corpora remains the primary bottleneck for English-Tatar machine translation. While the OPUS project provides various datasets, our tests reveal that datasets such as WikiMatrix, GNOME, and NLLB suffer from significant noise and incorrect labeling, making them unsuitable for training robust encoder-decoder translation models, which typically require large amounts of high-quality data. Furthermore, we demonstrate that small-scale multilingual Large Language Models (LLMs), such as Qwen3 (4B-30B), Gemma3 (4B-12B), and others, show severe “Turkish interference”: they frequently hallucinate Turkish vocabulary when prompted for Tatar. In this paper, we navigate this data scarcity by leveraging Llama 3.3 70B Instruct, the only model in our zero-shot benchmarks capable of maintaining distinct linguistic boundaries for Tatar. To address the lack of gold-standard data, we curated a synthetic dataset of 7,995 high-quality translation pairs using a frontier model as a teacher. We then performed 4-bit LoRA fine-tuning to train Llama for English-Tatar translation. Our results show a clear performance leap: fine-tuning on the limited Tatoeba dataset (1,193 samples) yielded a CHRF++ score of 24.38, while fine-tuning on our synthetic dataset achieved 32.02 on the LoResMT 2026 shared task test set. We release our curated dataset and fine-tuned models to support further research in low-resource Turkic machine translation.
%U https://aclanthology.org/2026.loresmt-1.16/
%P 198-202
Markdown (Informal)
[Navigating Data Scarcity in Low-Resource English-Tatar Translation using LLM Fine-Tuning](https://aclanthology.org/2026.loresmt-1.16/) (Khamis, LoResMT 2026)
ACL
Ahmed Khaled Khamis. 2026. [Navigating Data Scarcity in Low-Resource English-Tatar Translation using LLM Fine-Tuning](https://aclanthology.org/2026.loresmt-1.16/). In *Proceedings for the Ninth Workshop on Technologies for Machine Translation of Low Resource Languages (LoResMT 2026)*, pages 198–202, Rabat, Morocco. Association for Computational Linguistics.