% BibTeX record for https://aclanthology.org/2026.mellm-1.27/
% (workshop paper, MeLLM 2026 proceedings, pp. 275--283).
@inproceedings{chowdhury-woolf-2026-benchmarking,
  title     = {Benchmarking Byte-Pair Encoding Tokenizers on Different Languages with Bits per Byte},
  author    = {Chowdhury, Soham and
               Woolf, Warren},
  editor    = {Huang, Kaiyu and
               Mo, Fengran and
               Chen, Pinzhen and
               Jiang, Meng},
  booktitle = {Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models ({MeLLM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.mellm-1.27/},
  pages     = {275--283},
  isbn      = {979-8-89176-430-9},
  abstract  = {Tokenization significantly affects the cross-lingual performance of language models, yet recent tokenizer variants such as SuperBPE and MorphBPE have not been systematically evaluated across typologically diverse languages. We conduct the first extrinsic cross-language comparison of BPE, SuperBPE, and MorphBPE tokenizers on English, Mandarin, and Hungarian, using bits per byte (BPB) normalized perplexity as our metric, with vocabulary sizes of 8K, 16K, and 32K. We find that SuperBPE matches BPE for English but underperforms by 0.01{--}0.06 BPB for Hungarian and Mandarin, suggesting that cross-whitespace merging is counterproductive for non-English languages. MorphBPE performs worse than BPE across all settings, with gaps of 0.02{--}0.04 BPB at the 32K vocabulary size. These results suggest that linguistic theory alone does not guarantee practical improvements in tokenizer design, and that standard BPE remains a surprisingly effective baseline across typologically diverse languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey chowdhury-woolf-2026-benchmarking -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chowdhury-woolf-2026-benchmarking">
    <!-- The paper itself: title, authors, issue date -->
    <titleInfo>
      <title>Benchmarking Byte-Pair Encoding Tokenizers on Different Languages with Bits per Byte</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Soham</namePart>
      <namePart type="family">Chowdhury</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Warren</namePart>
      <namePart type="family">Woolf</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kaiyu</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fengran</namePart>
        <namePart type="family">Mo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pinzhen</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Meng</namePart>
        <namePart type="family">Jiang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-430-9</identifier>
    </relatedItem>
    <abstract>Tokenization significantly affects the cross-lingual performance of language models, yet recent tokenizer variants such as SuperBPE and MorphBPE have not been systematically evaluated across typologically diverse languages. We conduct the first extrinsic cross-language comparison of BPE, SuperBPE, and MorphBPE tokenizers on English, Mandarin, and Hungarian, using bits per byte (BPB) normalized perplexity as our metric, with vocabulary sizes of 8K, 16K, and 32K. We find that SuperBPE matches BPE for English but underperforms by 0.01–0.06 BPB for Hungarian and Mandarin, suggesting that cross-whitespace merging is counterproductive for non-English languages. MorphBPE performs worse than BPE across all settings, with gaps of 0.02–0.04 BPB at the 32K vocabulary size. These results suggest that linguistic theory alone does not guarantee practical improvements in tokenizer design, and that standard BPE remains a surprisingly effective baseline across typologically diverse languages.</abstract>
    <identifier type="citekey">chowdhury-woolf-2026-benchmarking</identifier>
    <location>
      <url>https://aclanthology.org/2026.mellm-1.27/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>275</start>
        <end>283</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking Byte-Pair Encoding Tokenizers on Different Languages with Bits per Byte
%A Chowdhury, Soham
%A Woolf, Warren
%Y Huang, Kaiyu
%Y Mo, Fengran
%Y Chen, Pinzhen
%Y Jiang, Meng
%S Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-430-9
%F chowdhury-woolf-2026-benchmarking
%X Tokenization significantly affects the cross-lingual performance of language models, yet recent tokenizer variants such as SuperBPE and MorphBPE have not been systematically evaluated across typologically diverse languages. We conduct the first extrinsic cross-language comparison of BPE, SuperBPE, and MorphBPE tokenizers on English, Mandarin, and Hungarian, using bits per byte (BPB) normalized perplexity as our metric, with vocabulary sizes of 8K, 16K, and 32K. We find that SuperBPE matches BPE for English but underperforms by 0.01–0.06 BPB for Hungarian and Mandarin, suggesting that cross-whitespace merging is counterproductive for non-English languages. MorphBPE performs worse than BPE across all settings, with gaps of 0.02–0.04 BPB at the 32K vocabulary size. These results suggest that linguistic theory alone does not guarantee practical improvements in tokenizer design, and that standard BPE remains a surprisingly effective baseline across typologically diverse languages.
%U https://aclanthology.org/2026.mellm-1.27/
%P 275-283
Markdown (Informal)
[Benchmarking Byte-Pair Encoding Tokenizers on Different Languages with Bits per Byte](https://aclanthology.org/2026.mellm-1.27/) (Chowdhury & Woolf, MeLLM 2026)
ACL