@inproceedings{chelombitko-komissarov-2024-specialized,
title = "Specialized Monolingual {BPE} Tokenizers for {U}ralic Languages Representation in Large Language Models",
author = "Chelombitko, Iaroslav and
Komissarov, Aleksey",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
Pirinen, Flammie and
Macias, Melany and
Crespo Avila, Mario},
booktitle = "Proceedings of the 9th International Workshop on Computational Linguistics for Uralic Languages",
month = nov,
year = "2024",
address = "Helsinki, Finland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.iwclul-1.11",
pages = "89--95",
abstract = "Large language models show significant inequality in language representation, particularly for Uralic languages. Our analysis found that existing tokenizers allocate minimal tokens to Uralic languages, highlighting this imbalance. To address this, we developed a pipeline to create clean monolingual datasets from Wikipedia articles for four Uralic languages. We trained Byte Pair Encoding (BPE) tokenizers with a vocabulary size of 256,000 tokens, though Northern Sami had only 93,187 due to limited data. Our findings revealed most tokens are unique to each language, with 8,102 shared across all four, and 25,876 shared among Estonian, Finnish, and Hungarian. Using the Compression Ratio metric, our tokenizers outperformed popular ones like LLaMA-2 and Gemma 2, reducing Finnish{'}s compression ratio from 3.41 to 1.18. These results demonstrate the importance of specialized tokenizers for underrepresented languages, improving model performance and lowering costs. By sharing our tokenizers and datasets, we provide crucial resources for further research, emphasizing the need for equitable language representation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chelombitko-komissarov-2024-specialized">
    <titleInfo>
      <title>Specialized Monolingual BPE Tokenizers for Uralic Languages Representation in Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Iaroslav</namePart>
      <namePart type="family">Chelombitko</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aleksey</namePart>
      <namePart type="family">Komissarov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 9th International Workshop on Computational Linguistics for Uralic Languages</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mika</namePart>
        <namePart type="family">Hämäläinen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Flammie</namePart>
        <namePart type="family">Pirinen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Melany</namePart>
        <namePart type="family">Macias</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mario</namePart>
        <namePart type="family">Crespo Avila</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Helsinki, Finland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Large language models show significant inequality in language representation, particularly for Uralic languages. Our analysis found that existing tokenizers allocate minimal tokens to Uralic languages, highlighting this imbalance. To address this, we developed a pipeline to create clean monolingual datasets from Wikipedia articles for four Uralic languages. We trained Byte Pair Encoding (BPE) tokenizers with a vocabulary size of 256,000 tokens, though Northern Sami had only 93,187 due to limited data. Our findings revealed most tokens are unique to each language, with 8,102 shared across all four, and 25,876 shared among Estonian, Finnish, and Hungarian. Using the Compression Ratio metric, our tokenizers outperformed popular ones like LLaMA-2 and Gemma 2, reducing Finnish’s compression ratio from 3.41 to 1.18. These results demonstrate the importance of specialized tokenizers for underrepresented languages, improving model performance and lowering costs. By sharing our tokenizers and datasets, we provide crucial resources for further research, emphasizing the need for equitable language representation.</abstract>
    <identifier type="citekey">chelombitko-komissarov-2024-specialized</identifier>
    <location>
      <url>https://aclanthology.org/2024.iwclul-1.11</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>89</start>
        <end>95</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Specialized Monolingual BPE Tokenizers for Uralic Languages Representation in Large Language Models
%A Chelombitko, Iaroslav
%A Komissarov, Aleksey
%Y Hämäläinen, Mika
%Y Pirinen, Flammie
%Y Macias, Melany
%Y Crespo Avila, Mario
%S Proceedings of the 9th International Workshop on Computational Linguistics for Uralic Languages
%D 2024
%8 November
%I Association for Computational Linguistics
%C Helsinki, Finland
%F chelombitko-komissarov-2024-specialized
%X Large language models show significant inequality in language representation, particularly for Uralic languages. Our analysis found that existing tokenizers allocate minimal tokens to Uralic languages, highlighting this imbalance. To address this, we developed a pipeline to create clean monolingual datasets from Wikipedia articles for four Uralic languages. We trained Byte Pair Encoding (BPE) tokenizers with a vocabulary size of 256,000 tokens, though Northern Sami had only 93,187 due to limited data. Our findings revealed most tokens are unique to each language, with 8,102 shared across all four, and 25,876 shared among Estonian, Finnish, and Hungarian. Using the Compression Ratio metric, our tokenizers outperformed popular ones like LLaMA-2 and Gemma 2, reducing Finnish’s compression ratio from 3.41 to 1.18. These results demonstrate the importance of specialized tokenizers for underrepresented languages, improving model performance and lowering costs. By sharing our tokenizers and datasets, we provide crucial resources for further research, emphasizing the need for equitable language representation.
%U https://aclanthology.org/2024.iwclul-1.11
%P 89-95
Markdown (Informal)
[Specialized Monolingual BPE Tokenizers for Uralic Languages Representation in Large Language Models](https://aclanthology.org/2024.iwclul-1.11) (Chelombitko & Komissarov, IWCLUL 2024)
ACL
Iaroslav Chelombitko and Aleksey Komissarov. 2024. Specialized Monolingual BPE Tokenizers for Uralic Languages Representation in Large Language Models. In Proceedings of the 9th International Workshop on Computational Linguistics for Uralic Languages, pages 89–95, Helsinki, Finland. Association for Computational Linguistics.
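
The abstract describes training monolingual BPE tokenizers with a 256,000-token vocabulary and comparing tokenizers with a compression ratio metric. As a rough, hedged sketch only (not the authors' released code), the snippet below shows how such a tokenizer could be trained and scored with the Hugging Face `tokenizers` library. The corpus path `fi_wiki.txt`, the whitespace pre-tokenizer, and the tokens-per-word definition of the compression ratio are assumptions; the paper's actual pipeline and exact metric definition may differ.

```python
# Minimal sketch, assuming: the Hugging Face `tokenizers` library, a plain-text
# Finnish Wikipedia dump at fi_wiki.txt (hypothetical path), and compression
# ratio defined as tokens per whitespace-delimited word (one plausible reading
# of the metric; the paper may define it differently).
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# Train a BPE tokenizer with the vocabulary size mentioned in the abstract.
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=256_000, special_tokens=["[UNK]"])
tokenizer.train(files=["fi_wiki.txt"], trainer=trainer)
tokenizer.save("bpe_fi.json")

def compression_ratio(tok: Tokenizer, text: str) -> float:
    """Tokens emitted per whitespace-delimited word."""
    n_tokens = len(tok.encode(text).ids)
    n_words = len(text.split())
    return n_tokens / max(n_words, 1)

# Evaluate on any held-out Finnish text; lower is better.
sample = "Suomen kieli on uralilainen kieli."
print(compression_ratio(tokenizer, sample))
```

Under these assumptions, a lower ratio means the tokenizer splits words into fewer pieces, which is the direction of improvement the abstract reports for Finnish (3.41 to 1.18).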