@article{yamaguchi-etal-2026-effectively,
title = "How Can We Effectively Expand the Vocabulary of {LLM}s with 0.01{GB} of Target Language Text?",
author = "Yamaguchi, Atsuki and
Villavicencio, Aline and
Aletras, Nikolaos",
journal = "Computational Linguistics",
volume = "52",
number = "1",
month = mar,
year = "2026",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2026.cl-1.9/",
doi = "10.1162/coli.a.581",
pages = "295--330",
abstract = "Large language models (LLMs) have shown remarkable capabilities in many languages beyond English. Yet, LLMs require more inference steps when generating non-English text due to their reliance on English-centric tokenizers and vocabulary, resulting in higher usage costs to non-English speakers. Vocabulary expansion with target language tokens is a widely used cross-lingual vocabulary adaptation approach to remedy this issue. Despite its effectiveness in inference speedup, previous work on vocabulary expansion has focused on high-resource settings assuming access to a substantial amount of target language data to effectively initialize the embeddings of the new tokens and adapt the LLM to the target language. However, vocabulary expansion in low-resource settings has yet to be explored. In this article, we investigate vocabulary expansion in low-resource settings by considering embedding initialization methods and continual pre-training strategies. Through extensive experiments across typologically diverse languages, tasks, and models, we establish a set of strategies to perform vocabulary expansion for faster inference, while striving to maintain competitive downstream performance to baselines. This is achieved with only 30K sentences ({\ensuremath{\sim}}0.01GB text data) from the target language."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yamaguchi-etal-2026-effectively">
<titleInfo>
<title>How Can We Effectively Expand the Vocabulary of LLMs with 0.01GB of Target Language Text?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atsuki</namePart>
<namePart type="family">Yamaguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolaos</namePart>
<namePart type="family">Aletras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Large language models (LLMs) have shown remarkable capabilities in many languages beyond English. Yet, LLMs require more inference steps when generating non-English text due to their reliance on English-centric tokenizers and vocabulary, resulting in higher usage costs to non-English speakers. Vocabulary expansion with target language tokens is a widely used cross-lingual vocabulary adaptation approach to remedy this issue. Despite its effectiveness in inference speedup, previous work on vocabulary expansion has focused on high-resource settings assuming access to a substantial amount of target language data to effectively initialize the embeddings of the new tokens and adapt the LLM to the target language. However, vocabulary expansion in low-resource settings has yet to be explored. In this article, we investigate vocabulary expansion in low-resource settings by considering embedding initialization methods and continual pre-training strategies. Through extensive experiments across typologically diverse languages, tasks, and models, we establish a set of strategies to perform vocabulary expansion for faster inference, while striving to maintain competitive downstream performance to baselines. This is achieved with only 30K sentences (~0.01GB text data) from the target language.</abstract>
<identifier type="citekey">yamaguchi-etal-2026-effectively</identifier>
<identifier type="doi">10.1162/coli.a.581</identifier>
<location>
<url>https://aclanthology.org/2026.cl-1.9/</url>
</location>
<part>
<date>2026-03</date>
<detail type="volume"><number>52</number></detail>
<detail type="issue"><number>1</number></detail>
<extent unit="page">
<start>295</start>
<end>330</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T How Can We Effectively Expand the Vocabulary of LLMs with 0.01GB of Target Language Text?
%A Yamaguchi, Atsuki
%A Villavicencio, Aline
%A Aletras, Nikolaos
%J Computational Linguistics
%D 2026
%8 March
%V 52
%N 1
%I MIT Press
%C Cambridge, MA
%F yamaguchi-etal-2026-effectively
%X Large language models (LLMs) have shown remarkable capabilities in many languages beyond English. Yet, LLMs require more inference steps when generating non-English text due to their reliance on English-centric tokenizers and vocabulary, resulting in higher usage costs to non-English speakers. Vocabulary expansion with target language tokens is a widely used cross-lingual vocabulary adaptation approach to remedy this issue. Despite its effectiveness in inference speedup, previous work on vocabulary expansion has focused on high-resource settings assuming access to a substantial amount of target language data to effectively initialize the embeddings of the new tokens and adapt the LLM to the target language. However, vocabulary expansion in low-resource settings has yet to be explored. In this article, we investigate vocabulary expansion in low-resource settings by considering embedding initialization methods and continual pre-training strategies. Through extensive experiments across typologically diverse languages, tasks, and models, we establish a set of strategies to perform vocabulary expansion for faster inference, while striving to maintain competitive downstream performance to baselines. This is achieved with only 30K sentences (~0.01GB text data) from the target language.
%R 10.1162/coli.a.581
%U https://aclanthology.org/2026.cl-1.9/
%U https://doi.org/10.1162/coli.a.581
%P 295-330
Markdown (Informal)
[How Can We Effectively Expand the Vocabulary of LLMs with 0.01GB of Target Language Text?](https://aclanthology.org/2026.cl-1.9/) (Yamaguchi et al., CL 2026)
ACL