@inproceedings{nozaki-etal-2025-vrcp,
title = "{VRCP}: Vocabulary Replacement Continued Pretraining for Efficient Multilingual Language Models",
author = "Nozaki, Yuta and
Nakashima, Dai and
Sato, Ryo and
Asaba, Naoki",
booktitle = "Proceedings of the Second Workshop on Scaling Up Multilingual {\&} Multi-Cultural Evaluation",
month = jan,
year = "2025",
address = "Abu Dhabi",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sumeval-2.5/",
pages = "48--59",
abstract = "Building large language models (LLMs) for non-English languages involves leveraging extensively trained English models through continued pre-training on the target language corpora. This approach harnesses the rich semantic knowledge embedded in English models, allowing superior performance compared to training from scratch. However, tokenizers not optimized for the target language may make inefficiencies in training. We propose Vocabulary Replacement Continued Pretraining (VRCP), a method that optimizes the tokenizer for the target language by replacing unique (solely available) vocabulary from the source tokenizer while maintaining the overall vocabulary size. This approach preserves the semantic knowledge of the source model while enhancing token efficiency and performance for the target language. We evaluated VRCP using the Llama-2 model on Japanese and Chinese corpora. The results show that VRCP matches the performance of vocabulary expansion methods on benchmarks and achieves superior performance in summarization tasks. Additionally, VRCP provides an optimized tokenizer that balances token efficiency, task performance, and GPU memory footprint, making it particularly suitable for resource-constrained environments."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nozaki-etal-2025-vrcp">
<titleInfo>
<title>VRCP: Vocabulary Replacement Continued Pretraining for Efficient Multilingual Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuta</namePart>
<namePart type="family">Nozaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dai</namePart>
<namePart type="family">Nakashima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryo</namePart>
<namePart type="family">Sato</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoki</namePart>
<namePart type="family">Asaba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Scaling Up Multilingual &amp; Multi-Cultural Evaluation</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Building large language models (LLMs) for non-English languages involves leveraging extensively trained English models through continued pre-training on the target language corpora. This approach harnesses the rich semantic knowledge embedded in English models, allowing superior performance compared to training from scratch. However, tokenizers not optimized for the target language may introduce inefficiencies in training. We propose Vocabulary Replacement Continued Pretraining (VRCP), a method that optimizes the tokenizer for the target language by replacing vocabulary unique to (i.e., solely available in) the source tokenizer while maintaining the overall vocabulary size. This approach preserves the semantic knowledge of the source model while enhancing token efficiency and performance for the target language. We evaluated VRCP using the Llama-2 model on Japanese and Chinese corpora. The results show that VRCP matches the performance of vocabulary expansion methods on benchmarks and achieves superior performance in summarization tasks. Additionally, VRCP provides an optimized tokenizer that balances token efficiency, task performance, and GPU memory footprint, making it particularly suitable for resource-constrained environments.</abstract>
<identifier type="citekey">nozaki-etal-2025-vrcp</identifier>
<location>
<url>https://aclanthology.org/2025.sumeval-2.5/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>48</start>
<end>59</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VRCP: Vocabulary Replacement Continued Pretraining for Efficient Multilingual Language Models
%A Nozaki, Yuta
%A Nakashima, Dai
%A Sato, Ryo
%A Asaba, Naoki
%S Proceedings of the Second Workshop on Scaling Up Multilingual & Multi-Cultural Evaluation
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi
%F nozaki-etal-2025-vrcp
%X Building large language models (LLMs) for non-English languages involves leveraging extensively trained English models through continued pre-training on the target language corpora. This approach harnesses the rich semantic knowledge embedded in English models, allowing superior performance compared to training from scratch. However, tokenizers not optimized for the target language may introduce inefficiencies in training. We propose Vocabulary Replacement Continued Pretraining (VRCP), a method that optimizes the tokenizer for the target language by replacing vocabulary unique to (i.e., solely available in) the source tokenizer while maintaining the overall vocabulary size. This approach preserves the semantic knowledge of the source model while enhancing token efficiency and performance for the target language. We evaluated VRCP using the Llama-2 model on Japanese and Chinese corpora. The results show that VRCP matches the performance of vocabulary expansion methods on benchmarks and achieves superior performance in summarization tasks. Additionally, VRCP provides an optimized tokenizer that balances token efficiency, task performance, and GPU memory footprint, making it particularly suitable for resource-constrained environments.
%U https://aclanthology.org/2025.sumeval-2.5/
%P 48-59
Markdown (Informal)
[VRCP: Vocabulary Replacement Continued Pretraining for Efficient Multilingual Language Models](https://aclanthology.org/2025.sumeval-2.5/) (Nozaki et al., SUMEval 2025)
ACL
Yuta Nozaki, Dai Nakashima, Ryo Sato, and Naoki Asaba. 2025. VRCP: Vocabulary Replacement Continued Pretraining for Efficient Multilingual Language Models. In Proceedings of the Second Workshop on Scaling Up Multilingual & Multi-Cultural Evaluation, pages 48–59, Abu Dhabi. Association for Computational Linguistics.
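
The abstract describes replacing tokens that exist only in the source tokenizer with target-language tokens while keeping the vocabulary size fixed. The snippet below is a minimal, hypothetical sketch of that idea only; the toy vocabularies, usage counts, and the zero-usage selection rule are assumptions for illustration, not the VRCP procedure or the authors' code.

```python
# Toy sketch: swap source-tokenizer tokens that never fire on the target corpus
# for target-language tokens, reusing their ids so the vocabulary size |V| is
# unchanged. Everything here is illustrative, not the paper's implementation.
from collections import Counter


def replace_vocabulary(source_vocab, target_candidates, usage_counts):
    """Return a same-size vocabulary with unused source tokens replaced."""
    new_vocab = dict(source_vocab)
    # Source tokens with zero usage are treated as replaceable slots.
    replaceable = [t for t in source_vocab if usage_counts[t] == 0]
    # Candidate target-language tokens not already in the vocabulary.
    fresh = [t for t in target_candidates if t not in source_vocab]
    for old_token, new_token in zip(replaceable, fresh):
        token_id = new_vocab.pop(old_token)  # free the old token's id...
        new_vocab[new_token] = token_id      # ...and reuse it for the new token
    return new_vocab


if __name__ == "__main__":
    src = {"hello": 0, "world": 1, "qx##": 2, "zz@@": 3}   # toy source vocabulary
    counts = Counter({"hello": 120, "world": 80})          # "qx##", "zz@@" unused
    print(replace_vocabulary(src, ["日本", "語"], counts))
    # -> {'hello': 0, 'world': 1, '日本': 2, '語': 3}
```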