@inproceedings{pourmostafa-roshan-sharami-etal-2025-analysis,
title = "Analysis of Vocabulary and Subword Tokenization Settings for Optimal Fine-tuning of {MT}: A Case Study of In-domain Translation",
author = "Pourmostafa Roshan Sharami, Javad and
Shterionov, Dimitar and
Spronck, Pieter",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.111/",
pages = "970--979",
abstract = "The choice of vocabulary and subword (SW) tokenization has a significant impact on both training and fine-tuning of language and translation models. Fine-tuning is a common practice in optimizing a model with respect to new data. However, new data potentially introduces new words (or tokens), which, if not considered, may lead to suboptimal performance. In addition, the distribution of tokens in the new data can differ from the distribution of the original data. As such, the original SW tokenization model could be less suitable for the new data. With this work, we aim to gain better insights on the impact of SW tokenization and vocabulary generation on the performance of neural machine translation (NMT) models fine-tuned to a specific domain. To do so, we compare several strategies for SW tokenization and vocabulary generation and investigate the performance of the resulting models. Our findings show that the best way to fine-tune for domain adaptation is to consistently use both BPE and vocabulary from the in-domain data, which helps the model pick up on important domain-specific terms. At the same time, it is crucial not to lose sight of the vocabulary of the base (pre-trained) model{---}maintaining coverage of this vocabulary ensures the model keeps its general language abilities. The most successful configurations are those that introduce plenty of frequent domain terms while still retaining a substantial portion of the base model vocabulary, leading to noticeably better translation quality and adaptation, as seen in higher BLEU scores. These benefits, however, often come with greater computational costs, such as longer training times, since the model must learn more new tokens. Conversely, approaches that skip important domain terms or combine mismatched tokenization and vocabulary do not perform as well, making it clear that both domain-specific adaptation and broad vocabulary coverage matter{---}and that these gains are realized when the vocabulary preserves a good portion of the base (pre-trained) model. While using in-domain BPE and vocabulary yields the best domain adaptation, it substantially reduces out-of-domain translation quality. Hybrid configurations that combine base and domain vocabularies help balance this trade-off, maintaining broader translation capabilities alongside improved domain performance."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pourmostafa-roshan-sharami-etal-2025-analysis">
<titleInfo>
<title>Analysis of Vocabulary and Subword Tokenization Settings for Optimal Fine-tuning of MT: A Case Study of In-domain Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Javad</namePart>
<namePart type="family">Pourmostafa Roshan Sharami</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dimitar</namePart>
<namePart type="family">Shterionov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pieter</namePart>
<namePart type="family">Spronck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The choice of vocabulary and subword (SW) tokenization has a significant impact on both training and fine-tuning of language and translation models. Fine-tuning is a common practice in optimizing a model with respect to new data. However, new data potentially introduces new words (or tokens), which, if not considered, may lead to suboptimal performance. In addition, the distribution of tokens in the new data can differ from the distribution of the original data. As such, the original SW tokenization model could be less suitable for the new data. With this work, we aim to gain better insights on the impact of SW tokenization and vocabulary generation on the performance of neural machine translation (NMT) models fine-tuned to a specific domain. To do so, we compare several strategies for SW tokenization and vocabulary generation and investigate the performance of the resulting models. Our findings show that the best way to fine-tune for domain adaptation is to consistently use both BPE and vocabulary from the in-domain data, which helps the model pick up on important domain-specific terms. At the same time, it is crucial not to lose sight of the vocabulary of the base (pre-trained) model—maintaining coverage of this vocabulary ensures the model keeps its general language abilities. The most successful configurations are those that introduce plenty of frequent domain terms while still retaining a substantial portion of the base model vocabulary, leading to noticeably better translation quality and adaptation, as seen in higher BLEU scores. These benefits, however, often come with greater computational costs, such as longer training times, since the model must learn more new tokens. Conversely, approaches that skip important domain terms or combine mismatched tokenization and vocabulary do not perform as well, making it clear that both domain-specific adaptation and broad vocabulary coverage matter—and that these gains are realized when the vocabulary preserves a good portion of the base (pre-trained) model. While using in-domain BPE and vocabulary yields the best domain adaptation, it substantially reduces out-of-domain translation quality. Hybrid configurations that combine base and domain vocabularies help balance this trade-off, maintaining broader translation capabilities alongside improved domain performance.</abstract>
<identifier type="citekey">pourmostafa-roshan-sharami-etal-2025-analysis</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.111/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>970</start>
<end>979</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Analysis of Vocabulary and Subword Tokenization Settings for Optimal Fine-tuning of MT: A Case Study of In-domain Translation
%A Pourmostafa Roshan Sharami, Javad
%A Shterionov, Dimitar
%A Spronck, Pieter
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F pourmostafa-roshan-sharami-etal-2025-analysis
%X The choice of vocabulary and subword (SW) tokenization has a significant impact on both training and fine-tuning of language and translation models. Fine-tuning is a common practice in optimizing a model with respect to new data. However, new data potentially introduces new words (or tokens), which, if not considered, may lead to suboptimal performance. In addition, the distribution of tokens in the new data can differ from the distribution of the original data. As such, the original SW tokenization model could be less suitable for the new data. With this work, we aim to gain better insights on the impact of SW tokenization and vocabulary generation on the performance of neural machine translation (NMT) models fine-tuned to a specific domain. To do so, we compare several strategies for SW tokenization and vocabulary generation and investigate the performance of the resulting models. Our findings show that the best way to fine-tune for domain adaptation is to consistently use both BPE and vocabulary from the in-domain data, which helps the model pick up on important domain-specific terms. At the same time, it is crucial not to lose sight of the vocabulary of the base (pre-trained) model—maintaining coverage of this vocabulary ensures the model keeps its general language abilities. The most successful configurations are those that introduce plenty of frequent domain terms while still retaining a substantial portion of the base model vocabulary, leading to noticeably better translation quality and adaptation, as seen in higher BLEU scores. These benefits, however, often come with greater computational costs, such as longer training times, since the model must learn more new tokens. Conversely, approaches that skip important domain terms or combine mismatched tokenization and vocabulary do not perform as well, making it clear that both domain-specific adaptation and broad vocabulary coverage matter—and that these gains are realized when the vocabulary preserves a good portion of the base (pre-trained) model. While using in-domain BPE and vocabulary yields the best domain adaptation, it substantially reduces out-of-domain translation quality. Hybrid configurations that combine base and domain vocabularies help balance this trade-off, maintaining broader translation capabilities alongside improved domain performance.
%U https://aclanthology.org/2025.ranlp-1.111/
%P 970-979
Markdown (Informal)
[Analysis of Vocabulary and Subword Tokenization Settings for Optimal Fine-tuning of MT: A Case Study of In-domain Translation](https://aclanthology.org/2025.ranlp-1.111/) (Pourmostafa Roshan Sharami et al., RANLP 2025)
ACL
Javad Pourmostafa Roshan Sharami, Dimitar Shterionov, and Pieter Spronck. 2025. Analysis of Vocabulary and Subword Tokenization Settings for Optimal Fine-tuning of MT: A Case Study of In-domain Translation. In Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era, pages 970–979, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.
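
To make the "hybrid" base-plus-domain vocabulary described in the abstract concrete, the sketch below shows one way such a vocabulary could be assembled: train a BPE model on the in-domain corpus and append its tokens to the base (pre-trained) model's vocabulary without discarding base entries. This is a minimal illustration, not the authors' implementation; the choice of SentencePiece as the BPE toolkit, the file names (`in_domain.txt`, `base_model.vocab`), and the vocabulary size are all assumptions made for this example.

```python
# Hypothetical sketch of a hybrid vocabulary: base-model tokens plus
# frequent in-domain BPE tokens. Toolkit, file names, and sizes are
# illustrative assumptions, not taken from the paper.
import sentencepiece as spm

# Train a BPE model on the (assumed) in-domain corpus.
spm.SentencePieceTrainer.train(
    input="in_domain.txt",
    model_prefix="in_domain_bpe",
    vocab_size=16000,
    model_type="bpe",
)

def load_vocab(path):
    # SentencePiece-style .vocab files hold one "token<TAB>score" pair per line.
    with open(path, encoding="utf-8") as f:
        return [line.split("\t")[0] for line in f]

base_vocab = load_vocab("base_model.vocab")       # vocabulary of the pre-trained model
domain_vocab = load_vocab("in_domain_bpe.vocab")  # vocabulary learned from in-domain data

# Keep the full base vocabulary, then append in-domain tokens it does not
# already cover, so domain terms are added without losing base coverage.
seen = set(base_vocab)
hybrid_vocab = base_vocab + [tok for tok in domain_vocab if tok not in seen]

with open("hybrid.vocab", "w", encoding="utf-8") as f:
    f.write("\n".join(hybrid_vocab))

print(f"base={len(base_vocab)} domain={len(domain_vocab)} hybrid={len(hybrid_vocab)}")
```

Before fine-tuning, the model's embedding matrix would still need to be extended (and the tokenizer re-indexed) to account for the newly appended tokens; how that is done depends on the NMT framework in use.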