% BibTeX export of ACL Anthology record 2026.mellm-1.21 (MeLLM 2026 workshop paper).
% Case-sensitive tokens in title/booktitle are brace-protected so that
% sentence-casing bibliography styles do not lowercase them.
@inproceedings{kanjirangat-etal-2026-evaluating,
  title     = {Evaluating Multilingual Tokenization under Worst-{N} Parity-Aware {BPE}},
  author    = {Kanjirangat, Vani and
               Kletz, David and
               Samardzic, Tanja and
               Dolamic, Ljiljana and
               Rinaldi, Fabio},
  editor    = {Huang, Kaiyu and
               Mo, Fengran and
               Chen, Pinzhen and
               Jiang, Meng},
  booktitle = {Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models ({MeLLM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.mellm-1.21/},
  pages     = {221--228},
  isbn      = {979-8-89176-430-9},
  abstract  = {Improving the fairness of a language model is a goal that applies at every level of the model. In this paper, we evaluate a method targeting a foundational level: tokenization. We present a multilingual evaluation of parity-aware tokenization under worst-$N$ optimization, extending PA-BPE to jointly optimize over the $N$ worst-compressed languages. We evaluate this formulation for $N > 1$ across vocabulary sizes of 16K and 32K on the languages from the flores+ benchmark, using metrics that capture both efficiency and structural alignment. Our results reveal that the effects of increasing $N$ are inconsistent across metrics and do not lead to major gains. Efficiency-oriented and boundary-level metrics show a modest tendency to improve at higher values of $N$, while structural alignment metrics (such as AST alignment and boundary crossing) exhibit no clear pattern, suggesting that compression fairness and linguistic structure are mainly orthogonal objectives. Script-level analysis further reveals uneven effects across writing systems, with several non-Latin scripts showing greater sensitivity to increasing $N$.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kanjirangat-etal-2026-evaluating">
<titleInfo>
<title>Evaluating Multilingual Tokenization under Worst-N Parity-Aware BPE</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vani</namePart>
<namePart type="family">Kanjirangat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Kletz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanja</namePart>
<namePart type="family">Samardzic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ljiljana</namePart>
<namePart type="family">Dolamic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Rinaldi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kaiyu</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fengran</namePart>
<namePart type="family">Mo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meng</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-430-9</identifier>
</relatedItem>
<abstract>Improving the fairness of a language model is a goal that applies at every level of the model. In this paper, we evaluate a method targeting a foundational level: tokenization. We present a multilingual evaluation of parity-aware tokenization under worst-N optimization, extending PA-BPE to jointly optimize over the N worst-compressed languages. We evaluate this formulation for N > 1 across vocabulary sizes of 16K and 32K on the languages from the flores+ benchmark, using metrics that capture both efficiency and structural alignment. Our results reveal that the effects of increasing N are inconsistent across metrics and do not lead to major gains. Efficiency-oriented and boundary-level metrics show a modest tendency to improve at higher values of N, while structural alignment metrics (such as AST alignment and boundary crossing) exhibit no clear pattern, suggesting that compression fairness and linguistic structure are mainly orthogonal objectives. Script-level analysis further reveals uneven effects across writing systems, with several non-Latin scripts showing greater sensitivity to increasing N.</abstract>
<identifier type="citekey">kanjirangat-etal-2026-evaluating</identifier>
<location>
<url>https://aclanthology.org/2026.mellm-1.21/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>221</start>
<end>228</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Multilingual Tokenization under Worst-N Parity-Aware BPE
%A Kanjirangat, Vani
%A Kletz, David
%A Samardzic, Tanja
%A Dolamic, Ljiljana
%A Rinaldi, Fabio
%Y Huang, Kaiyu
%Y Mo, Fengran
%Y Chen, Pinzhen
%Y Jiang, Meng
%S Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-430-9
%F kanjirangat-etal-2026-evaluating
%X Improving the fairness of a language model is a goal that applies at every level of the model. In this paper, we evaluate a method targeting a foundational level: tokenization. We present a multilingual evaluation of parity-aware tokenization under worst-N optimization, extending PA-BPE to jointly optimize over the N worst-compressed languages. We evaluate this formulation for N > 1 across vocabulary sizes of 16K and 32K on the languages from the flores+ benchmark, using metrics that capture both efficiency and structural alignment. Our results reveal that the effects of increasing N are inconsistent across metrics and do not lead to major gains. Efficiency-oriented and boundary-level metrics show a modest tendency to improve at higher values of N, while structural alignment metrics (such as AST alignment and boundary crossing) exhibit no clear pattern, suggesting that compression fairness and linguistic structure are mainly orthogonal objectives. Script-level analysis further reveals uneven effects across writing systems, with several non-Latin scripts showing greater sensitivity to increasing N.
%U https://aclanthology.org/2026.mellm-1.21/
%P 221-228
Markdown (Informal)
[Evaluating Multilingual Tokenization under Worst-N Parity-Aware BPE](https://aclanthology.org/2026.mellm-1.21/) (Kanjirangat et al., MeLLM 2026)
ACL
- Vani Kanjirangat, David Kletz, Tanja Samardzic, Ljiljana Dolamic, and Fabio Rinaldi. 2026. Evaluating Multilingual Tokenization under Worst-N Parity-Aware BPE. In Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026), pages 221–228, San Diego, United States. Association for Computational Linguistics.