@inproceedings{lundin-etal-2026-token,
title = "The Token Tax: Systematic Bias in Multilingual Tokenization",
author = "Lundin, Jessica M. and
Zhang, Ada and
Karim, Nihal and
Louzan, Hamza and
Wei, Guohao and
Adelani, David Ifeoluwa and
Carroll, Cody",
editor = "Chimoto, Everlyn Asiko and
Lignos, Constantine and
Muhammad, Shamsuddeen and
Abdulmumin, Idris and
Siro, Clemencia and
Adelani, David Ifeoluwa",
booktitle = "Proceedings of the 7th Workshop on {A}frican Natural Language Processing ({A}frica{NLP} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.africanlp-main.10/",
pages = "103--112",
ISBN = "979-8-89176-364-7",
abstract = "Tokenization inefficiency is associated with structural disadvantages on morphologically complex, low-resource languages, inflating compute resources and reducing accuracy. We evaluate 10 Large Language Models (LLMs) on AfriMMLU (5 subjects; 16 African languages) and show that token fertility reliably predicts accuracy. Higher fertility consistently predicts lower accuracy across all models and subjects. We further find that reasoning models (e.g., DeepSeek, o1) consistently outperform non-reasoning peers across high- and low-resource languages in the AfriMMLU dataset, narrowing accuracy gaps observed in prior generations. In terms of economics, a doubling in tokens results in quadrupled training cost and time, underscoring the ``token tax'' faced by many languages. These results motivate morphologically aware tokenization, fair pricing, and multilingual benchmarks for equitable natural language processing (NLP)."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lundin-etal-2026-token">
<titleInfo>
<title>The Token Tax: Systematic Bias in Multilingual Tokenization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jessica</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Lundin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ada</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nihal</namePart>
<namePart type="family">Karim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamza</namePart>
<namePart type="family">Louzan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guohao</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">Ifeoluwa</namePart>
<namePart type="family">Adelani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cody</namePart>
<namePart type="family">Carroll</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on African Natural Language Processing (AfricaNLP 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Everlyn</namePart>
<namePart type="given">Asiko</namePart>
<namePart type="family">Chimoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Constantine</namePart>
<namePart type="family">Lignos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shamsuddeen</namePart>
<namePart type="family">Muhammad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Idris</namePart>
<namePart type="family">Abdulmumin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clemencia</namePart>
<namePart type="family">Siro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">Ifeoluwa</namePart>
<namePart type="family">Adelani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-364-7</identifier>
</relatedItem>
<abstract>Tokenization inefficiency is associated with structural disadvantages on morphologically complex, low-resource languages, inflating compute resources and reducing accuracy. We evaluate 10 Large Language Models (LLMs) on AfriMMLU (5 subjects; 16 African languages) and show that token fertility reliably predicts accuracy. Higher fertility consistently predicts lower accuracy across all models and subjects. We further find that reasoning models (e.g., DeepSeek, o1) consistently outperform non-reasoning peers across high- and low-resource languages in the AfriMMLU dataset, narrowing accuracy gaps observed in prior generations. In terms of economics, a doubling in tokens results in quadrupled training cost and time, underscoring the “token tax” faced by many languages. These results motivate morphologically aware tokenization, fair pricing, and multilingual benchmarks for equitable natural language processing (NLP).</abstract>
<identifier type="citekey">lundin-etal-2026-token</identifier>
<location>
<url>https://aclanthology.org/2026.africanlp-main.10/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>103</start>
<end>112</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Token Tax: Systematic Bias in Multilingual Tokenization
%A Lundin, Jessica M.
%A Zhang, Ada
%A Karim, Nihal
%A Louzan, Hamza
%A Wei, Guohao
%A Adelani, David Ifeoluwa
%A Carroll, Cody
%Y Chimoto, Everlyn Asiko
%Y Lignos, Constantine
%Y Muhammad, Shamsuddeen
%Y Abdulmumin, Idris
%Y Siro, Clemencia
%Y Adelani, David Ifeoluwa
%S Proceedings of the 7th Workshop on African Natural Language Processing (AfricaNLP 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-364-7
%F lundin-etal-2026-token
%X Tokenization inefficiency is associated with structural disadvantages on morphologically complex, low-resource languages, inflating compute resources and reducing accuracy. We evaluate 10 Large Language Models (LLMs) on AfriMMLU (5 subjects; 16 African languages) and show that token fertility reliably predicts accuracy. Higher fertility consistently predicts lower accuracy across all models and subjects. We further find that reasoning models (e.g., DeepSeek, o1) consistently outperform non-reasoning peers across high- and low-resource languages in the AfriMMLU dataset, narrowing accuracy gaps observed in prior generations. In terms of economics, a doubling in tokens results in quadrupled training cost and time, underscoring the “token tax” faced by many languages. These results motivate morphologically aware tokenization, fair pricing, and multilingual benchmarks for equitable natural language processing (NLP).
%U https://aclanthology.org/2026.africanlp-main.10/
%P 103-112
Markdown (Informal)
[The Token Tax: Systematic Bias in Multilingual Tokenization](https://aclanthology.org/2026.africanlp-main.10/) (Lundin et al., AfricaNLP 2026)
ACL
- Jessica M. Lundin, Ada Zhang, Nihal Karim, Hamza Louzan, Guohao Wei, David Ifeoluwa Adelani, and Cody Carroll. 2026. The Token Tax: Systematic Bias in Multilingual Tokenization. In Proceedings of the 7th Workshop on African Natural Language Processing (AfricaNLP 2026), pages 103–112, Rabat, Morocco. Association for Computational Linguistics.