% BibTeX entry: conference paper in the UNLP 2026 proceedings (key: shpigunov-2026-improving).
@inproceedings{shpigunov-2026-improving,
  title     = {Improving Domain-Specific Translation from {English} into {Ukrainian} with Retrieval-Augmented Generation},
  author    = {Shpigunov, Anton},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Fifth {Ukrainian} Natural Language Processing Conference ({UNLP} 2026)},
  month     = may,
  year      = {2026},
  address   = {Lviv, Ukraine},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.unlp-1.1/},
  pages     = {1--11},
  isbn      = {979-8-89176-359-3},
  abstract  = {Large language models have demonstrated competence as language translators, including for lower-resourced languages like Ukrainian. However, in specialized or novel domains, translation quality can suffer without adequate lexical and stylistic reference material. We present a retrieval-augmented approach to English-Ukrainian machine translation in a narrow domain: a private legal/military bilingual corpus. In this approach, semantically similar translation units retrieved via vector embeddings are provided as in-context examples to the LLM. We evaluate three open-weight Gemma 3 models, 4B, 12B, and 27B, against Gemini 3 Flash as a baseline across five augmentation conditions, with k values of 0, 3, 5, 10, and 25, on a 2,581-pair index and a 258-pair test set. We find that context augmentation yields statistically significant improvements in both ChrF++ and COMET for all models, with the smallest model{'}s COMET score improving by 0.076 at k = 3. However, smaller models exhibit context saturation: the 4B model{'}s performance peaks at k = 10 and degrades with additional context, losing 9.72 ChrF++ points and 0.007 COMET between k = 10 and k = 25, while larger models continue to benefit.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record (citekey shpigunov-2026-improving). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="shpigunov-2026-improving">
    <!-- The paper itself: title, author, issue date. -->
    <titleInfo>
      <title>Improving Domain-Specific Translation from English into Ukrainian with Retrieval-Augmented Generation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Anton</namePart>
      <namePart type="family">Shpigunov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume the paper appears in (editor, publisher, place, ISBN). -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mariana</namePart>
        <namePart type="family">Romanyshyn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Lviv, Ukraine</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-359-3</identifier>
    </relatedItem>
    <!-- Abstract is kept on one line so no extra whitespace enters the text content. -->
    <abstract>Large language models have demonstrated competence as language translators, including for lower-resourced languages like Ukrainian. However, in specialized or novel domains, translation quality can suffer without adequate lexical and stylistic reference material. We present a retrieval-augmented approach to English-Ukrainian machine translation in a narrow domain: a private legal/military bilingual corpus. In this approach, semantically similar translation units retrieved via vector embeddings are provided as in-context examples to the LLM. We evaluate three open-weight Gemma 3 models, 4B, 12B, and 27B, against Gemini 3 Flash as a baseline across five augmentation conditions, with k values of 0, 3, 5, 10, and 25, on a 2,581-pair index and a 258-pair test set. We find that context augmentation yields statistically significant improvements in both ChrF++ and COMET for all models, with the smallest model’s COMET score improving by 0.076 at k = 3. However, smaller models exhibit context saturation: the 4B model’s performance peaks at k = 10 and degrades with additional context, losing 9.72 ChrF++ points and 0.007 COMET between k = 10 and k = 25, while larger models continue to benefit.</abstract>
    <identifier type="citekey">shpigunov-2026-improving</identifier>
    <location>
      <url>https://aclanthology.org/2026.unlp-1.1/</url>
    </location>
    <!-- Position of the paper within the host volume: date and page span. -->
    <part>
      <date>2026-05</date>
      <extent unit="page">
        <start>1</start>
        <end>11</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Domain-Specific Translation from English into Ukrainian with Retrieval-Augmented Generation
%A Shpigunov, Anton
%Y Romanyshyn, Mariana
%S Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)
%D 2026
%8 May
%I Association for Computational Linguistics
%C Lviv, Ukraine
%@ 979-8-89176-359-3
%F shpigunov-2026-improving
%X Large language models have demonstrated competence as language translators, including for lower-resourced languages like Ukrainian. However, in specialized or novel domains, translation quality can suffer without adequate lexical and stylistic reference material. We present a retrieval-augmented approach to English-Ukrainian machine translation in a narrow domain: a private legal/military bilingual corpus. In this approach, semantically similar translation units retrieved via vector embeddings are provided as in-context examples to the LLM. We evaluate three open-weight Gemma 3 models, 4B, 12B, and 27B, against Gemini 3 Flash as a baseline across five augmentation conditions, with k values of 0, 3, 5, 10, and 25, on a 2,581-pair index and a 258-pair test set. We find that context augmentation yields statistically significant improvements in both ChrF++ and COMET for all models, with the smallest model’s COMET score improving by 0.076 at k = 3. However, smaller models exhibit context saturation: the 4B model’s performance peaks at k = 10 and degrades with additional context, losing 9.72 ChrF++ points and 0.007 COMET between k = 10 and k = 25, while larger models continue to benefit.
%U https://aclanthology.org/2026.unlp-1.1/
%P 1-11
Markdown (Informal)
[Improving Domain-Specific Translation from English into Ukrainian with Retrieval-Augmented Generation](https://aclanthology.org/2026.unlp-1.1/) (Shpigunov, UNLP 2026)
ACL