BibTeX
@inproceedings{kyslyi-etal-2025-vuyko,
title = "Vuyko Mistral: Adapting {LLM}s for Low-Resource Dialectal Translation",
author = "Kyslyi, Roman and
Maksymiuk, Yuliia and
Pysmennyi, Ihor",
editor = "Romanyshyn, Mariana",
booktitle = "Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria (online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.unlp-1.10/",
doi = "10.18653/v1/2025.unlp-1.10",
pages = "86--95",
ISBN = "979-8-89176-269-5",
abstract = "In this paper we introduce the first effort to adapt large language models (LLMs) to the Ukrainian dialect (in our case Hutsul), a low-resource and morphologically complex dialect spoken in the Carpathian Highlands. We created a parallel corpus of 9852 dialect-to-standard Ukrainian sentence pairs and a dictionary of 7320 dialectal word mappings. We also addressed data shortage by proposing an advanced Retrieval-Augmented Generation (RAG) pipeline to generate synthetic parallel translation pairs, expanding the corpus with 52142 examples. We have fine-tuned multiple open-source LLMs using LoRA and evaluated them on a standard-to-dialect translation task, also comparing with few-shot GPT-4o translation. In the absence of human annotators, we adopt a multi-metric evaluation strategy combining BLEU, chrF++, TER, and LLM-based judgment (GPT-4o). The results show that even small(7B) finetuned models outperform zero-shot baselines such as GPT-4o across both automatic and LLM-evaluated metrics. All data, models, and code are publicly released at: https://github.com/woters/vuyko-hutsul."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kyslyi-etal-2025-vuyko">
<titleInfo>
<title>Vuyko Mistral: Adapting LLMs for Low-Resource Dialectal Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Kyslyi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuliia</namePart>
<namePart type="family">Maksymiuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ihor</namePart>
<namePart type="family">Pysmennyi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="family">Romanyshyn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria (online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-269-5</identifier>
</relatedItem>
<abstract>In this paper we introduce the first effort to adapt large language models (LLMs) to the Ukrainian dialect (in our case Hutsul), a low-resource and morphologically complex dialect spoken in the Carpathian Highlands. We created a parallel corpus of 9852 dialect-to-standard Ukrainian sentence pairs and a dictionary of 7320 dialectal word mappings. We also addressed data shortage by proposing an advanced Retrieval-Augmented Generation (RAG) pipeline to generate synthetic parallel translation pairs, expanding the corpus with 52142 examples. We have fine-tuned multiple open-source LLMs using LoRA and evaluated them on a standard-to-dialect translation task, also comparing with few-shot GPT-4o translation. In the absence of human annotators, we adopt a multi-metric evaluation strategy combining BLEU, chrF++, TER, and LLM-based judgment (GPT-4o). The results show that even small(7B) finetuned models outperform zero-shot baselines such as GPT-4o across both automatic and LLM-evaluated metrics. All data, models, and code are publicly released at: https://github.com/woters/vuyko-hutsul.</abstract>
<identifier type="citekey">kyslyi-etal-2025-vuyko</identifier>
<identifier type="doi">10.18653/v1/2025.unlp-1.10</identifier>
<location>
<url>https://aclanthology.org/2025.unlp-1.10/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>86</start>
<end>95</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Vuyko Mistral: Adapting LLMs for Low-Resource Dialectal Translation
%A Kyslyi, Roman
%A Maksymiuk, Yuliia
%A Pysmennyi, Ihor
%Y Romanyshyn, Mariana
%S Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria (online)
%@ 979-8-89176-269-5
%F kyslyi-etal-2025-vuyko
%X In this paper, we introduce the first effort to adapt large language models (LLMs) to a Ukrainian dialect (in our case, Hutsul), a low-resource and morphologically complex dialect spoken in the Carpathian Highlands. We created a parallel corpus of 9852 dialect-to-standard Ukrainian sentence pairs and a dictionary of 7320 dialectal word mappings. We also addressed the data shortage by proposing an advanced Retrieval-Augmented Generation (RAG) pipeline to generate synthetic parallel translation pairs, expanding the corpus with 52142 examples. We fine-tuned multiple open-source LLMs using LoRA and evaluated them on a standard-to-dialect translation task, also comparing them with few-shot GPT-4o translation. In the absence of human annotators, we adopted a multi-metric evaluation strategy combining BLEU, chrF++, TER, and LLM-based judgment (GPT-4o). The results show that even small (7B) fine-tuned models outperform zero-shot baselines such as GPT-4o across both automatic and LLM-evaluated metrics. All data, models, and code are publicly released at: https://github.com/woters/vuyko-hutsul.
%R 10.18653/v1/2025.unlp-1.10
%U https://aclanthology.org/2025.unlp-1.10/
%U https://doi.org/10.18653/v1/2025.unlp-1.10
%P 86-95

Markdown (Informal)
[Vuyko Mistral: Adapting LLMs for Low-Resource Dialectal Translation](https://aclanthology.org/2025.unlp-1.10/) (Kyslyi et al., UNLP 2025)
ACL
Roman Kyslyi, Yuliia Maksymiuk, and Ihor Pysmennyi. 2025. Vuyko Mistral: Adapting LLMs for Low-Resource Dialectal Translation. In Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025), pages 86–95, Vienna, Austria (online). Association for Computational Linguistics.
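
Reproducibility note: the automatic metrics named in the abstract (BLEU, chrF++, TER) can be computed with the sacrebleu Python library. The sketch below is illustrative only: the file names and variable names are assumptions, not part of the authors' released code (see https://github.com/woters/vuyko-hutsul for the official data and pipeline).

# Minimal sketch, assuming one model output (hypothesis) and one reference
# translation per line in plain-text files; the file names are hypothetical.
from sacrebleu.metrics import BLEU, CHRF, TER

with open("hypotheses.txt", encoding="utf-8") as f:
    hyps = [line.strip() for line in f]
with open("references.txt", encoding="utf-8") as f:
    refs = [line.strip() for line in f]

bleu = BLEU()                # corpus-level BLEU
chrfpp = CHRF(word_order=2)  # word_order=2 turns chrF into chrF++
ter = TER()                  # translation edit rate (lower is better)

# sacrebleu expects a list of reference streams, hence the extra nesting [refs].
for metric in (bleu, chrfpp, ter):
    print(metric.corpus_score(hyps, [refs]))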