@inproceedings{kiulian-etal-2025-english,
  title     = {From {E}nglish-Centric to Effective Bilingual: {LLM}s with Custom Tokenizers for Underrepresented Languages},
  author    = {Kiulian, Artur and
               Polishko, Anton and
               Khandoga, Mykola and
               Kostiuk, Yevhen and
               Gabrielli, Guillermo and
               Gaga{\l}a, {\L}ukasz and
               Zaraket, Fadi and
               Abu Obaida, Qusai and
               Garud, Hrishikesh and
               Wing Yee Mak, Wendy and
               Chaplynskyi, Dmytro and
               Amor, Selma and
               Peradze, Grigol},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria (online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.unlp-1.1/},
  doi       = {10.18653/v1/2025.unlp-1.1},
  pages     = {1--13},
  isbn      = {979-8-89176-269-5},
  abstract  = {In this paper, we propose a model-agnostic cost-effective approach to developing bilingual base large language models (LLMs) to support English and any target language. The method includes vocabulary expansion, initialization of new embeddings, model training and evaluation. We performed our experiments with three languages, each using a non-Latin script{---}Ukrainian, Arabic, and Georgian. Our approach demonstrates improved language performance while reducing computational costs. It mitigates the disproportionate penalization of underrepresented languages, promoting fairness and minimizing adverse phenomena such as code-switching and broken grammar. Additionally, we introduce new metrics to evaluate language quality, revealing that vocabulary size significantly impacts the quality of generated text.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kiulian-etal-2025-english">
<titleInfo>
<title>From English-Centric to Effective Bilingual: LLMs with Custom Tokenizers for Underrepresented Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Artur</namePart>
<namePart type="family">Kiulian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Polishko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mykola</namePart>
<namePart type="family">Khandoga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yevhen</namePart>
<namePart type="family">Kostiuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guillermo</namePart>
<namePart type="family">Gabrielli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Łukasz</namePart>
<namePart type="family">Gagała</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fadi</namePart>
<namePart type="family">Zaraket</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qusai</namePart>
<namePart type="family">Abu Obaida</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrishikesh</namePart>
<namePart type="family">Garud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wendy</namePart>
<namePart type="family">Wing Yee Mak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dmytro</namePart>
<namePart type="family">Chaplynskyi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Selma</namePart>
<namePart type="family">Amor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Grigol</namePart>
<namePart type="family">Peradze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="family">Romanyshyn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria (online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-269-5</identifier>
</relatedItem>
<abstract>In this paper, we propose a model-agnostic cost-effective approach to developing bilingual base large language models (LLMs) to support English and any target language. The method includes vocabulary expansion, initialization of new embeddings, model training and evaluation. We performed our experiments with three languages, each using a non-Latin script—Ukrainian, Arabic, and Georgian. Our approach demonstrates improved language performance while reducing computational costs. It mitigates the disproportionate penalization of underrepresented languages, promoting fairness and minimizing adverse phenomena such as code-switching and broken grammar. Additionally, we introduce new metrics to evaluate language quality, revealing that vocabulary size significantly impacts the quality of generated text.</abstract>
<identifier type="citekey">kiulian-etal-2025-english</identifier>
<identifier type="doi">10.18653/v1/2025.unlp-1.1</identifier>
<location>
<url>https://aclanthology.org/2025.unlp-1.1/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1</start>
<end>13</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From English-Centric to Effective Bilingual: LLMs with Custom Tokenizers for Underrepresented Languages
%A Kiulian, Artur
%A Polishko, Anton
%A Khandoga, Mykola
%A Kostiuk, Yevhen
%A Gabrielli, Guillermo
%A Gagała, Łukasz
%A Zaraket, Fadi
%A Abu Obaida, Qusai
%A Garud, Hrishikesh
%A Wing Yee Mak, Wendy
%A Chaplynskyi, Dmytro
%A Amor, Selma
%A Peradze, Grigol
%Y Romanyshyn, Mariana
%S Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria (online)
%@ 979-8-89176-269-5
%F kiulian-etal-2025-english
%X In this paper, we propose a model-agnostic cost-effective approach to developing bilingual base large language models (LLMs) to support English and any target language. The method includes vocabulary expansion, initialization of new embeddings, model training and evaluation. We performed our experiments with three languages, each using a non-Latin script—Ukrainian, Arabic, and Georgian. Our approach demonstrates improved language performance while reducing computational costs. It mitigates the disproportionate penalization of underrepresented languages, promoting fairness and minimizing adverse phenomena such as code-switching and broken grammar. Additionally, we introduce new metrics to evaluate language quality, revealing that vocabulary size significantly impacts the quality of generated text.
%R 10.18653/v1/2025.unlp-1.1
%U https://aclanthology.org/2025.unlp-1.1/
%U https://doi.org/10.18653/v1/2025.unlp-1.1
%P 1-13
Markdown (Informal)
[From English-Centric to Effective Bilingual: LLMs with Custom Tokenizers for Underrepresented Languages](https://aclanthology.org/2025.unlp-1.1/) (Kiulian et al., UNLP 2025)
ACL
- Artur Kiulian, Anton Polishko, Mykola Khandoga, Yevhen Kostiuk, Guillermo Gabrielli, Łukasz Gagała, Fadi Zaraket, Qusai Abu Obaida, Hrishikesh Garud, Wendy Wing Yee Mak, Dmytro Chaplynskyi, Selma Amor, and Grigol Peradze. 2025. From English-Centric to Effective Bilingual: LLMs with Custom Tokenizers for Underrepresented Languages. In Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025), pages 1–13, Vienna, Austria (online). Association for Computational Linguistics.