@inproceedings{velayuthan-sarveswaran-2025-egalitarian,
    title = "Egalitarian Language Representation in Language Models: It All Begins with Tokenizers",
    author = "Velayuthan, Menan and
      Sarveswaran, Kengatharaiyer",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Eugenio, Barbara Di and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.400/",
    pages = "5987--5996",
    abstract = "Tokenizers act as a bridge between human language and the latent space of language models, influencing how language is represented in these models. Despite the dominance of English-Centric (EC) Large Language Models (LLMs), tokenization methods often fail to fairly represent complex scripts like Tamil, Sinhala, and Hindi, primarily due to pre-tokenization choices. This study demonstrates that pre-tokenization has a more significant impact than tokenization algorithms on achieving egalitarian representation. To address this, we introduce an improvement to the Byte Pair Encoding (BPE) algorithm by incorporating graphemes, which we term Grapheme Pair Encoding (GPE). Our experiments show that grapheme-based character extraction outperforms byte-level tokenizers for complex scripts. We validate this approach through experiments on Tamil, Sinhala, and Hindi. The codebase and resources used in this work are publicly available at https://github.com/vmenan/tokenizers-coling2025."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="velayuthan-sarveswaran-2025-egalitarian">
    <titleInfo>
      <title>Egalitarian Language Representation in Language Models: It All Begins with Tokenizers</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Menan</namePart>
      <namePart type="family">Velayuthan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kengatharaiyer</namePart>
      <namePart type="family">Sarveswaran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Barbara</namePart>
        <namePart type="given">Di</namePart>
        <namePart type="family">Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Tokenizers act as a bridge between human language and the latent space of language models, influencing how language is represented in these models. Despite the dominance of English-Centric (EC) Large Language Models (LLMs), tokenization methods often fail to fairly represent complex scripts like Tamil, Sinhala, and Hindi, primarily due to pre-tokenization choices. This study demonstrates that pre-tokenization has a more significant impact than tokenization algorithms on achieving egalitarian representation. To address this, we introduce an improvement to the Byte Pair Encoding (BPE) algorithm by incorporating graphemes, which we term Grapheme Pair Encoding (GPE). Our experiments show that grapheme-based character extraction outperforms byte-level tokenizers for complex scripts. We validate this approach through experiments on Tamil, Sinhala, and Hindi. The codebase and resources used in this work are publicly available at https://github.com/vmenan/tokenizers-coling2025.</abstract>
    <identifier type="citekey">velayuthan-sarveswaran-2025-egalitarian</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.400/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>5987</start>
        <end>5996</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Egalitarian Language Representation in Language Models: It All Begins with Tokenizers
%A Velayuthan, Menan
%A Sarveswaran, Kengatharaiyer
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F velayuthan-sarveswaran-2025-egalitarian
%X Tokenizers act as a bridge between human language and the latent space of language models, influencing how language is represented in these models. Despite the dominance of English-Centric (EC) Large Language Models (LLMs), tokenization methods often fail to fairly represent complex scripts like Tamil, Sinhala, and Hindi, primarily due to pre-tokenization choices. This study demonstrates that pre-tokenization has a more significant impact than tokenization algorithms on achieving egalitarian representation. To address this, we introduce an improvement to the Byte Pair Encoding (BPE) algorithm by incorporating graphemes, which we term Grapheme Pair Encoding (GPE). Our experiments show that grapheme-based character extraction outperforms byte-level tokenizers for complex scripts. We validate this approach through experiments on Tamil, Sinhala, and Hindi. The codebase and resources used in this work are publicly available at https://github.com/vmenan/tokenizers-coling2025.
%U https://aclanthology.org/2025.coling-main.400/
%P 5987-5996
Markdown (Informal)
[Egalitarian Language Representation in Language Models: It All Begins with Tokenizers](https://aclanthology.org/2025.coling-main.400/) (Velayuthan & Sarveswaran, COLING 2025)
ACL
Menan Velayuthan and Kengatharaiyer Sarveswaran. 2025. [Egalitarian Language Representation in Language Models: It All Begins with Tokenizers](https://aclanthology.org/2025.coling-main.400/). In *Proceedings of the 31st International Conference on Computational Linguistics*, pages 5987–5996, Abu Dhabi, UAE. Association for Computational Linguistics.
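
The abstract's central claim is that the atomic units chosen *before* BPE merging (bytes vs. grapheme clusters) matter more for scripts like Tamil than the merge algorithm itself. The sketch below illustrates that contrast only; it is not the paper's released GPE implementation (see the linked GitHub repository for that). It assumes the third-party `regex` package, whose `\X` pattern matches Unicode extended grapheme clusters per UAX #29, and the helper names `byte_units` and `grapheme_units` are hypothetical, chosen here for illustration.

```python
# Illustrative sketch: byte-level vs. grapheme-level atomic units
# for pre-tokenization, before any BPE-style merging takes place.
import regex  # third-party ("pip install regex"); supports \X

def byte_units(text: str) -> list[str]:
    """Byte-level pre-tokenization: every UTF-8 byte is an atomic unit."""
    return [f"{b:02x}" for b in text.encode("utf-8")]

def grapheme_units(text: str) -> list[str]:
    """Grapheme-level pre-tokenization: every extended grapheme cluster
    (user-perceived character, matched by \\X per UAX #29) is atomic."""
    return regex.findall(r"\X", text)

if __name__ == "__main__":
    word = "வணக்கம்"  # Tamil "vanakkam" ("hello"): 7 code points
    print(len(byte_units(word)))   # 21 -- each Tamil code point is 3 UTF-8 bytes
    print(grapheme_units(word))    # ['வ', 'ண', 'க்', 'க', 'ம்'] -- 5 units
    # Exact cluster counts can shift slightly across Unicode versions,
    # but the byte/grapheme gap for Brahmic scripts remains large.
```

On this input, the byte-level view yields 21 atomic units while the grapheme view yields 5, which is the kind of disparity that leads the paper to locate the fairness problem in pre-tokenization choices rather than in the BPE merge procedure itself.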