% BibTeX record for https://aclanthology.org/2025.wmt-1.84/
% Case-sensitive words in the title are protected with whole-word braces so
% that sentence-casing styles do not lowercase them.
@inproceedings{jumashev-etal-2025-kyrgyz,
  title     = {The {Kyrgyz} Seed Dataset Submission to the {WMT25} Open Language Data Initiative Shared Task},
  author    = {Jumashev, Murat and
               Tillabaeva, Alina and
               Kasieva, Aida and
               Omurkanov, Turgunbek and
               Musaeva, Akylai and
               Emil Kyzy, Meerim and
               Chagataeva, Gulaiym and
               Washington, Jonathan},
  editor    = {Haddow, Barry and
               Kocmi, Tom and
               Koehn, Philipp and
               Monz, Christof},
  booktitle = {Proceedings of the Tenth Conference on Machine Translation},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.wmt-1.84/},
  pages     = {1088--1102},
  isbn      = {979-8-89176-341-8},
  abstract  = {We present a Kyrgyz language seed dataset as part of our contribution to the WMT25 Open Language Data Initiative (OLDI) shared task. This paper details the process of collecting and curating English{--}Kyrgyz translations, highlighting the main challenges encountered in translating into a morphologically rich, low-resource language. We demonstrate the quality of the dataset through fine-tuning experiments, showing consistent improvements in machine translation performance across multiple models. Comparisons with bilingual and MNMT Kyrgyz-English baselines reveal that, for some models, our dataset enables performance surpassing pretrained baselines in both English{--}Kyrgyz and Kyrgyz{--}English translation directions. These results validate the dataset{'}s utility and suggest that it can serve as a valuable resource for the Kyrgyz MT community and other related low-resource languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey jumashev-etal-2025-kyrgyz: title, authors,
       issue date, host proceedings, abstract, identifiers and page extent. -->
  <mods ID="jumashev-etal-2025-kyrgyz">
    <titleInfo>
      <title>The Kyrgyz Seed Dataset Submission to the WMT25 Open Language Data Initiative Shared Task</title>
    </titleInfo>

    <!-- Authors, in citation order. -->
    <name type="personal">
      <namePart type="given">Murat</namePart>
      <namePart type="family">Jumashev</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alina</namePart>
      <namePart type="family">Tillabaeva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aida</namePart>
      <namePart type="family">Kasieva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Turgunbek</namePart>
      <namePart type="family">Omurkanov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Akylai</namePart>
      <namePart type="family">Musaeva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Meerim</namePart>
      <namePart type="family">Emil Kyzy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gulaiym</namePart>
      <namePart type="family">Chagataeva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jonathan</namePart>
      <namePart type="family">Washington</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, its editors and publication details. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Tenth Conference on Machine Translation</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Barry</namePart>
        <namePart type="family">Haddow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tom</namePart>
        <namePart type="family">Kocmi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Philipp</namePart>
        <namePart type="family">Koehn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christof</namePart>
        <namePart type="family">Monz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-341-8</identifier>
    </relatedItem>

    <abstract>We present a Kyrgyz language seed dataset as part of our contribution to the WMT25 Open Language Data Initiative (OLDI) shared task. This paper details the process of collecting and curating English–Kyrgyz translations, highlighting the main challenges encountered in translating into a morphologically rich, low-resource language. We demonstrate the quality of the dataset through fine-tuning experiments, showing consistent improvements in machine translation performance across multiple models. Comparisons with bilingual and MNMT Kyrgyz-English baselines reveal that, for some models, our dataset enables performance surpassing pretrained baselines in both English–Kyrgyz and Kyrgyz–English translation directions. These results validate the dataset’s utility and suggest that it can serve as a valuable resource for the Kyrgyz MT community and other related low-resource languages.</abstract>
    <identifier type="citekey">jumashev-etal-2025-kyrgyz</identifier>
    <location>
      <url>https://aclanthology.org/2025.wmt-1.84/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>1088</start>
        <end>1102</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Kyrgyz Seed Dataset Submission to the WMT25 Open Language Data Initiative Shared Task
%A Jumashev, Murat
%A Tillabaeva, Alina
%A Kasieva, Aida
%A Omurkanov, Turgunbek
%A Musaeva, Akylai
%A Emil Kyzy, Meerim
%A Chagataeva, Gulaiym
%A Washington, Jonathan
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Tenth Conference on Machine Translation
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-341-8
%F jumashev-etal-2025-kyrgyz
%X We present a Kyrgyz language seed dataset as part of our contribution to the WMT25 Open Language Data Initiative (OLDI) shared task. This paper details the process of collecting and curating English–Kyrgyz translations, highlighting the main challenges encountered in translating into a morphologically rich, low-resource language. We demonstrate the quality of the dataset through fine-tuning experiments, showing consistent improvements in machine translation performance across multiple models. Comparisons with bilingual and MNMT Kyrgyz-English baselines reveal that, for some models, our dataset enables performance surpassing pretrained baselines in both English–Kyrgyz and Kyrgyz–English translation directions. These results validate the dataset’s utility and suggest that it can serve as a valuable resource for the Kyrgyz MT community and other related low-resource languages.
%U https://aclanthology.org/2025.wmt-1.84/
%P 1088-1102
Markdown (Informal)
[The Kyrgyz Seed Dataset Submission to the WMT25 Open Language Data Initiative Shared Task](https://aclanthology.org/2025.wmt-1.84/) (Jumashev et al., WMT 2025)
ACL
- Murat Jumashev, Alina Tillabaeva, Aida Kasieva, Turgunbek Omurkanov, Akylai Musaeva, Meerim Emil Kyzy, Gulaiym Chagataeva, and Jonathan Washington. 2025. The Kyrgyz Seed Dataset Submission to the WMT25 Open Language Data Initiative Shared Task. In Proceedings of the Tenth Conference on Machine Translation, pages 1088–1102, Suzhou, China. Association for Computational Linguistics.