@inproceedings{lin-etal-2025-rethinking,
title = "Rethinking Vocabulary Augmentation: Addressing the Challenges of Low-Resource Languages in Multilingual Models",
author = "Lin, Nankai and
Zeng, Peijian and
Zheng, Weixiong and
Jiang, Shengyi and
Zhou, Dong and
Yang, Aimin",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.197/",
pages = "2919--2934",
abstract = "The performance of multilingual language models (MLLMs) is notably inferior for low-resource languages (LRL) compared to high-resource ones, primarily due to the limited available corpus during the pre-training phase. This inadequacy stems from the under-representation of low-resource language words in the subword vocabularies of MLLMs, leading to their misidentification as unknown or incorrectly concatenated subwords. Previous approaches are based on frequency sorting to select words for augmenting vocabularies. However, these methods overlook the fundamental disparities between model representation distributions and frequency distributions. To address this gap, we introduce a novel Entropy-Consistency Word Selection (ECWS) method, which integrates semantic and frequency metrics for vocabulary augmentation. Our results indicate an improvement in performance, supporting our approach as a viable means to enrich vocabularies inadequately represented in current MLLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lin-etal-2025-rethinking">
<titleInfo>
<title>Rethinking Vocabulary Augmentation: Addressing the Challenges of Low-Resource Languages in Multilingual Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nankai</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peijian</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weixiong</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shengyi</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dong</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aimin</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The performance of multilingual language models (MLLMs) is notably inferior for low-resource languages (LRL) compared to high-resource ones, primarily due to the limited available corpus during the pre-training phase. This inadequacy stems from the under-representation of low-resource language words in the subword vocabularies of MLLMs, leading to their misidentification as unknown or incorrectly concatenated subwords. Previous approaches are based on frequency sorting to select words for augmenting vocabularies. However, these methods overlook the fundamental disparities between model representation distributions and frequency distributions. To address this gap, we introduce a novel Entropy-Consistency Word Selection (ECWS) method, which integrates semantic and frequency metrics for vocabulary augmentation. Our results indicate an improvement in performance, supporting our approach as a viable means to enrich vocabularies inadequately represented in current MLLMs.</abstract>
<identifier type="citekey">lin-etal-2025-rethinking</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.197/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>2919</start>
<end>2934</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Rethinking Vocabulary Augmentation: Addressing the Challenges of Low-Resource Languages in Multilingual Models
%A Lin, Nankai
%A Zeng, Peijian
%A Zheng, Weixiong
%A Jiang, Shengyi
%A Zhou, Dong
%A Yang, Aimin
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F lin-etal-2025-rethinking
%X The performance of multilingual language models (MLLMs) is notably inferior for low-resource languages (LRL) compared to high-resource ones, primarily due to the limited available corpus during the pre-training phase. This inadequacy stems from the under-representation of low-resource language words in the subword vocabularies of MLLMs, leading to their misidentification as unknown or incorrectly concatenated subwords. Previous approaches are based on frequency sorting to select words for augmenting vocabularies. However, these methods overlook the fundamental disparities between model representation distributions and frequency distributions. To address this gap, we introduce a novel Entropy-Consistency Word Selection (ECWS) method, which integrates semantic and frequency metrics for vocabulary augmentation. Our results indicate an improvement in performance, supporting our approach as a viable means to enrich vocabularies inadequately represented in current MLLMs.
%U https://aclanthology.org/2025.coling-main.197/
%P 2919-2934
Markdown (Informal)
[Rethinking Vocabulary Augmentation: Addressing the Challenges of Low-Resource Languages in Multilingual Models](https://aclanthology.org/2025.coling-main.197/) (Lin et al., COLING 2025)
ACL