@inproceedings{haltiuk-smywinski-pohl-2025-path,
    title = "On the Path to Make {U}krainian a High-Resource Language",
    author = "Haltiuk, Mykola and
      Smywi{\'n}ski-Pohl, Aleksander",
    editor = "Romanyshyn, Mariana",
    booktitle = "Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)",
    month = jul,
    year = "2025",
    address = "Vienna, Austria (online)",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.unlp-1.14/",
    doi = "10.18653/v1/2025.unlp-1.14",
    pages = "120--130",
    ISBN = "979-8-89176-269-5",
    abstract = "Recent advances in multilingual language modeling have highlighted the importance of high-quality, large-scale datasets in enabling robust performance across languages. However, many low- and mid-resource languages, including Ukrainian, remain significantly underrepresented in existing pretraining corpora. We present Kobza, a large-scale Ukrainian text corpus containing nearly 60 billion tokens, aimed at improving the quality and scale of Ukrainian data available for training multilingual language models. We constructed Kobza from diverse, high-quality sources and applied rigorous deduplication to maximize data utility. Using this dataset, we pre-trained Modern-LiBERTa, the first Ukrainian transformer encoder capable of handling long contexts (up to 8192 tokens). Modern-LiBERTa achieves competitive results on various standard Ukrainian NLP benchmarks, particularly benefiting tasks that require broader contextual understanding or background knowledge. Our goal is to support future efforts to develop robust Ukrainian language models and to encourage greater inclusion of Ukrainian data in multilingual NLP research."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="haltiuk-smywinski-pohl-2025-path">
    <titleInfo>
      <title>On the Path to Make Ukrainian a High-Resource Language</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Mykola</namePart>
      <namePart type="family">Haltiuk</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aleksander</namePart>
      <namePart type="family">Smywiński-Pohl</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mariana</namePart>
        <namePart type="family">Romanyshyn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria (online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-269-5</identifier>
    </relatedItem>
    <abstract>Recent advances in multilingual language modeling have highlighted the importance of high-quality, large-scale datasets in enabling robust performance across languages. However, many low- and mid-resource languages, including Ukrainian, remain significantly underrepresented in existing pretraining corpora. We present Kobza, a large-scale Ukrainian text corpus containing nearly 60 billion tokens, aimed at improving the quality and scale of Ukrainian data available for training multilingual language models. We constructed Kobza from diverse, high-quality sources and applied rigorous deduplication to maximize data utility. Using this dataset, we pre-trained Modern-LiBERTa, the first Ukrainian transformer encoder capable of handling long contexts (up to 8192 tokens). Modern-LiBERTa achieves competitive results on various standard Ukrainian NLP benchmarks, particularly benefiting tasks that require broader contextual understanding or background knowledge. Our goal is to support future efforts to develop robust Ukrainian language models and to encourage greater inclusion of Ukrainian data in multilingual NLP research.</abstract>
    <identifier type="citekey">haltiuk-smywinski-pohl-2025-path</identifier>
    <identifier type="doi">10.18653/v1/2025.unlp-1.14</identifier>
    <location>
      <url>https://aclanthology.org/2025.unlp-1.14/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>120</start>
        <end>130</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T On the Path to Make Ukrainian a High-Resource Language
%A Haltiuk, Mykola
%A Smywiński-Pohl, Aleksander
%Y Romanyshyn, Mariana
%S Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria (online)
%@ 979-8-89176-269-5
%F haltiuk-smywinski-pohl-2025-path
%X Recent advances in multilingual language modeling have highlighted the importance of high-quality, large-scale datasets in enabling robust performance across languages. However, many low- and mid-resource languages, including Ukrainian, remain significantly underrepresented in existing pretraining corpora. We present Kobza, a large-scale Ukrainian text corpus containing nearly 60 billion tokens, aimed at improving the quality and scale of Ukrainian data available for training multilingual language models. We constructed Kobza from diverse, high-quality sources and applied rigorous deduplication to maximize data utility. Using this dataset, we pre-trained Modern-LiBERTa, the first Ukrainian transformer encoder capable of handling long contexts (up to 8192 tokens). Modern-LiBERTa achieves competitive results on various standard Ukrainian NLP benchmarks, particularly benefiting tasks that require broader contextual understanding or background knowledge. Our goal is to support future efforts to develop robust Ukrainian language models and to encourage greater inclusion of Ukrainian data in multilingual NLP research.
%R 10.18653/v1/2025.unlp-1.14
%U https://aclanthology.org/2025.unlp-1.14/
%U https://doi.org/10.18653/v1/2025.unlp-1.14
%P 120-130

Markdown (Informal)

[On the Path to Make Ukrainian a High-Resource Language](https://aclanthology.org/2025.unlp-1.14/) (Haltiuk & Smywiński-Pohl, UNLP 2025)

ACL

Mykola Haltiuk and Aleksander Smywiński-Pohl. 2025. On the Path to Make Ukrainian a High-Resource Language. In Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025), pages 120–130, Vienna, Austria (online). Association for Computational Linguistics.