@inproceedings{andryushchenko-ivanov-2025-evaluating,
    title = "Evaluating Tokenizer Adaptation Methods for Large Language Models on Low-Resource Programming Languages",
    author = "Andryushchenko, Georgy and
      Ivanov, Vladimir V.",
    editor = "Zhao, Jin and
      Wang, Mingyang and
      Liu, Zhu",
    booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.acl-srw.57/",
    doi = "10.18653/v1/2025.acl-srw.57",
    pages = "823--833",
    ISBN = "979-8-89176-254-1",
    abstract = "Large language models (LLMs), which are primarily trained on high-resource programming languages (HRPLs), tend to perform sub-optimally on low-resource programming languages (LRPLs). This study investigates the impact of tokenizer adaptation methods on improving code generation for LRPLs. We evaluate StarCoder 2 and DeepSeek-Coder models adapted to Elixir and Racket with methods such as Fast Vocabulary Transfer (FVT), FOCUS, and Zero-shot Tokenizer Transfer (ZeTT), and compare them with the original and fine-tuned models. Our experiments reveal that ZeTT outperforms the other methods, achieving significant improvements in handling syntax, program logic, and data types for LRPLs. However, we also highlight performance declines in non-target languages such as Python after tokenizer adaptation. The study confirms the positive impact of tokenizer adaptation on LRPL code generation and suggests directions for future research, including improving token embeddings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="andryushchenko-ivanov-2025-evaluating">
    <titleInfo>
      <title>Evaluating Tokenizer Adaptation Methods for Large Language Models on Low-Resource Programming Languages</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Georgy</namePart>
      <namePart type="family">Andryushchenko</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vladimir</namePart>
      <namePart type="given">V</namePart>
      <namePart type="family">Ivanov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Jin</namePart>
        <namePart type="family">Zhao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mingyang</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zhu</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-254-1</identifier>
    </relatedItem>
    <abstract>Large language models (LLMs), which are primarily trained on high-resource programming languages (HRPLs), tend to perform sub-optimally on low-resource programming languages (LRPLs). This study investigates the impact of tokenizer adaptation methods on improving code generation for LRPLs. We evaluate StarCoder 2 and DeepSeek-Coder models adapted to Elixir and Racket with methods such as Fast Vocabulary Transfer (FVT), FOCUS, and Zero-shot Tokenizer Transfer (ZeTT), and compare them with the original and fine-tuned models. Our experiments reveal that ZeTT outperforms the other methods, achieving significant improvements in handling syntax, program logic, and data types for LRPLs. However, we also highlight performance declines in non-target languages such as Python after tokenizer adaptation. The study confirms the positive impact of tokenizer adaptation on LRPL code generation and suggests directions for future research, including improving token embeddings.</abstract>
<identifier type="citekey">andryushchenko-ivanov-2025-evaluating</identifier>
<identifier type="doi">10.18653/v1/2025.acl-srw.57</identifier>
<location>
<url>https://aclanthology.org/2025.acl-srw.57/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>823</start>
<end>833</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Tokenizer Adaptation Methods for Large Language Models on Low-Resource Programming Languages
%A Andryushchenko, Georgy
%A Ivanov, Vladimir V.
%Y Zhao, Jin
%Y Wang, Mingyang
%Y Liu, Zhu
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-254-1
%F andryushchenko-ivanov-2025-evaluating
%X Large language models (LLMs), which are primarily trained on high-resource programming languages (HRPLs), tend to perform sub-optimally on low-resource programming languages (LRPLs). This study investigates the impact of tokenizer adaptation methods on improving code generation for LRPLs. We evaluate StarCoder 2 and DeepSeek-Coder models adapted to Elixir and Racket with methods such as Fast Vocabulary Transfer (FVT), FOCUS, and Zero-shot Tokenizer Transfer (ZeTT), and compare them with the original and fine-tuned models. Our experiments reveal that ZeTT outperforms the other methods, achieving significant improvements in handling syntax, program logic, and data types for LRPLs. However, we also highlight performance declines in non-target languages such as Python after tokenizer adaptation. The study confirms the positive impact of tokenizer adaptation on LRPL code generation and suggests directions for future research, including improving token embeddings.
%R 10.18653/v1/2025.acl-srw.57
%U https://aclanthology.org/2025.acl-srw.57/
%U https://doi.org/10.18653/v1/2025.acl-srw.57
%P 823-833
Markdown (Informal)
[Evaluating Tokenizer Adaptation Methods for Large Language Models on Low-Resource Programming Languages](https://aclanthology.org/2025.acl-srw.57/) (Andryushchenko & Ivanov, ACL 2025)
ACL
Georgy Andryushchenko and Vladimir V. Ivanov. 2025. Evaluating Tokenizer Adaptation Methods for Large Language Models on Low-Resource Programming Languages. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop), pages 823–833, Vienna, Austria. Association for Computational Linguistics.