@inproceedings{gren-kurfali-2026-efficient,
title = "Efficient Low-Resource Language Models Using Tokenizer Transfer",
author = "Gren, Gustaf and
Kurfali, Murathan",
editor = "Baez Santamaria, Selene and
Somayajula, Sai Ashish and
Yamaguchi, Atsuki",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-srw.49/",
pages = "639--648",
ISBN = "979-8-89176-383-8",
abstract = "Training a language model for low-resource languages is challenging due to data scarcity and computational cost. Tokenizer transfer offers a way to adapt a pre-trained model to a new tokenizer without full retraining, improving efficiency and cross-lingual applicability. To the best our of knowledge, we present the first controlled evaluation of tokenizer transfer on monolingually pretrained base models trained on language-specific corpora, Orthogonal Mapping Pursuit (OMP) and Fast Vocabulary Transfer (FVT), across six languages and multiple finetuning regimes. Using the Goldfish model family, we evaluate using byte-normalized log-perplexity and MultiBlimp accuracy for target-language adaptability, source-language retention, and the interaction between transfer and monolingual or mixed finetuning. OMP with monolingual target finetuning yields the best target-language scores (lower log-perplexity and higher MultiBlimp) among our evaluated conditions, compared with (i) a model trained only on the source language, (ii) a model trained on a smaller amount of target-language data, and (iii) the source language model adapted via standard finetuning on the target data. The results suggest tokenizer transfer is a compute-efficient alternative for low-resource LM training: train a monolingual tokenizer for the target language, transfer it to a larger pre-trained model, and fine-tune using the target data."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gren-kurfali-2026-efficient">
<titleInfo>
<title>Efficient Low-Resource Language Models Using Tokenizer Transfer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gustaf</namePart>
<namePart type="family">Gren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Murathan</namePart>
<namePart type="family">Kurfali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Selene</namePart>
<namePart type="family">Baez Santamaria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sai</namePart>
<namePart type="given">Ashish</namePart>
<namePart type="family">Somayajula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atsuki</namePart>
<namePart type="family">Yamaguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-383-8</identifier>
</relatedItem>
<abstract>Training a language model for low-resource languages is challenging due to data scarcity and computational cost. Tokenizer transfer offers a way to adapt a pretrained model to a new tokenizer without full retraining, improving efficiency and cross-lingual applicability. To the best of our knowledge, we present the first controlled evaluation of two tokenizer transfer methods, Orthogonal Matching Pursuit (OMP) and Fast Vocabulary Transfer (FVT), on monolingual base models pretrained on language-specific corpora, across six languages and multiple fine-tuning regimes. Using the Goldfish model family, we measure byte-normalized log-perplexity and MultiBLiMP accuracy to assess target-language adaptability, source-language retention, and the interaction between transfer and monolingual or mixed fine-tuning. OMP with monolingual target fine-tuning yields the best target-language scores (lower log-perplexity and higher MultiBLiMP accuracy) among our evaluated conditions, compared with (i) a model trained only on the source language, (ii) a model trained on a smaller amount of target-language data, and (iii) the source-language model adapted via standard fine-tuning on the target data. The results suggest tokenizer transfer is a compute-efficient alternative for low-resource LM training: train a monolingual tokenizer for the target language, transfer it to a larger pretrained model, and fine-tune on the target data.</abstract>
<identifier type="citekey">gren-kurfali-2026-efficient</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-srw.49/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>639</start>
<end>648</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Efficient Low-Resource Language Models Using Tokenizer Transfer
%A Gren, Gustaf
%A Kurfali, Murathan
%Y Baez Santamaria, Selene
%Y Somayajula, Sai Ashish
%Y Yamaguchi, Atsuki
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-383-8
%F gren-kurfali-2026-efficient
%X Training a language model for low-resource languages is challenging due to data scarcity and computational cost. Tokenizer transfer offers a way to adapt a pretrained model to a new tokenizer without full retraining, improving efficiency and cross-lingual applicability. To the best of our knowledge, we present the first controlled evaluation of two tokenizer transfer methods, Orthogonal Matching Pursuit (OMP) and Fast Vocabulary Transfer (FVT), on monolingual base models pretrained on language-specific corpora, across six languages and multiple fine-tuning regimes. Using the Goldfish model family, we measure byte-normalized log-perplexity and MultiBLiMP accuracy to assess target-language adaptability, source-language retention, and the interaction between transfer and monolingual or mixed fine-tuning. OMP with monolingual target fine-tuning yields the best target-language scores (lower log-perplexity and higher MultiBLiMP accuracy) among our evaluated conditions, compared with (i) a model trained only on the source language, (ii) a model trained on a smaller amount of target-language data, and (iii) the source-language model adapted via standard fine-tuning on the target data. The results suggest tokenizer transfer is a compute-efficient alternative for low-resource LM training: train a monolingual tokenizer for the target language, transfer it to a larger pretrained model, and fine-tune on the target data.
%U https://aclanthology.org/2026.eacl-srw.49/
%P 639-648
Markdown (Informal)
[Efficient Low-Resource Language Models Using Tokenizer Transfer](https://aclanthology.org/2026.eacl-srw.49/) (Gren & Kurfali, EACL 2026)
ACL
Gustaf Gren and Murathan Kurfali. 2026. Efficient Low-Resource Language Models Using Tokenizer Transfer. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop), pages 639–648, Rabat, Morocco. Association for Computational Linguistics.
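
To make the recipe in the abstract concrete, here is a minimal sketch of FVT-style embedding initialization: each token in the new (target-language) vocabulary gets the mean of the source model's embeddings for the subtokens the source tokenizer splits it into. This is an illustration of the FVT technique named in the abstract, not the authors' code; the model and tokenizer paths are placeholders, and the fallback handling is an assumption.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder names (hypothetical): swap in the actual source model
# and the monolingual target-language tokenizer.
src_name = "path/to/source-model"
tgt_name = "path/to/target-tokenizer"

src_tok = AutoTokenizer.from_pretrained(src_name)
tgt_tok = AutoTokenizer.from_pretrained(tgt_name)
model = AutoModelForCausalLM.from_pretrained(src_name)

src_emb = model.get_input_embeddings().weight.data   # shape (V_src, d)
new_emb = torch.empty(len(tgt_tok), src_emb.size(1))

for tgt_id in range(len(tgt_tok)):
    # Recover the new token's surface form, then re-segment it
    # with the source tokenizer.
    surface = tgt_tok.convert_tokens_to_string(
        [tgt_tok.convert_ids_to_tokens(tgt_id)])
    piece_ids = src_tok.encode(surface, add_special_tokens=False)
    if piece_ids:
        # FVT: mean of the source embeddings of the constituent subtokens.
        new_emb[tgt_id] = src_emb[piece_ids].mean(dim=0)
    else:
        # Assumed fallback (e.g. special tokens): global mean embedding.
        new_emb[tgt_id] = src_emb.mean(dim=0)

# Swap in the new vocabulary and embeddings, then fine-tune on target data.
model.resize_token_embeddings(len(tgt_tok))
model.get_input_embeddings().weight.data.copy_(new_emb)
```

OMP differs from this averaging scheme in that it reconstructs each new embedding as a sparse linear combination of source embeddings rather than a simple mean; in either case, the transferred model is then fine-tuned on target-language data as the abstract describes.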