@inproceedings{varis-etal-2026-tokcollate,
title = "{T}ok{C}ollate: A Comprehensive Tool for Tokenizer Evaluation and Visualization across Languages",
author = "Vari{\v{s}}, Du{\v{s}}an and
Stephen, Abishek and
Libovick{\'y}, Jind{\v{r}}ich",
editor = "Durrett, Greg and
Jian, Ping",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 3: System Demonstrations)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-demo.41/",
pages = "418--427",
ISBN = "979-8-89176-392-0",
abstract = "Tokenization quality varies significantly across languages, contributing to disparities in LLM performance and cost for speakers of less-resourced languages {--} a phenomenon known as the ``token premium'' problem. Despite growing research interest, no existing tool provides a comprehensive intrinsic evaluation of tokenizers paired with interactive visualization. We present TokCollate (pronounced similarly to chocolate), a Python-based evaluation framework combined with a JavaScript visualization interface that addresses this gap. TokCollate implements a wide range of intrinsic metrics, including monolingual measures such as average token length and R{\'e}nyi/Shannon efficiency, and cross-lingual measures such as vocabulary overlap, Jensen-Shannon divergence, alignment-based Eflomal scores, and length ratios. It further enables analysis across language groups defined by genealogical families, scripts, geographic regions, speaker populations, and estimated data availability. TokCollate is open-source under the MIT license and available on GitHub."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="varis-etal-2026-tokcollate">
<titleInfo>
<title>TokCollate: A Comprehensive Tool for Tokenizer Evaluation and Visualization across Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dušan</namePart>
<namePart type="family">Variš</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abishek</namePart>
<namePart type="family">Stephen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jindřich</namePart>
<namePart type="family">Libovický</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Greg</namePart>
<namePart type="family">Durrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ping</namePart>
<namePart type="family">Jian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-392-0</identifier>
</relatedItem>
<abstract>Tokenization quality varies significantly across languages, contributing to disparities in LLM performance and cost for speakers of less-resourced languages – a phenomenon known as the “token premium” problem. Despite growing research interest, no existing tool provides a comprehensive intrinsic evaluation of tokenizers paired with interactive visualization. We present TokCollate (pronounced similarly to chocolate), a Python-based evaluation framework combined with a JavaScript visualization interface that addresses this gap. TokCollate implements a wide range of intrinsic metrics, including monolingual measures such as average token length and Rényi/Shannon efficiency, and cross-lingual measures such as vocabulary overlap, Jensen-Shannon divergence, alignment-based Eflomal scores, and length ratios. It further enables analysis across language groups defined by genealogical families, scripts, geographic regions, speaker populations, and estimated data availability. TokCollate is open-source under the MIT license and available on GitHub.</abstract>
<identifier type="citekey">varis-etal-2026-tokcollate</identifier>
<location>
<url>https://aclanthology.org/2026.acl-demo.41/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>418</start>
<end>427</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TokCollate: A Comprehensive Tool for Tokenizer Evaluation and Visualization across Languages
%A Variš, Dušan
%A Stephen, Abishek
%A Libovický, Jindřich
%Y Durrett, Greg
%Y Jian, Ping
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-392-0
%F varis-etal-2026-tokcollate
%X Tokenization quality varies significantly across languages, contributing to disparities in LLM performance and cost for speakers of less-resourced languages – a phenomenon known as the “token premium” problem. Despite growing research interest, no existing tool provides a comprehensive intrinsic evaluation of tokenizers paired with interactive visualization. We present TokCollate (pronounced similarly to chocolate), a Python-based evaluation framework combined with a JavaScript visualization interface that addresses this gap. TokCollate implements a wide range of intrinsic metrics, including monolingual measures such as average token length and Rényi/Shannon efficiency, and cross-lingual measures such as vocabulary overlap, Jensen-Shannon divergence, alignment-based Eflomal scores, and length ratios. It further enables analysis across language groups defined by genealogical families, scripts, geographic regions, speaker populations, and estimated data availability. TokCollate is open-source under the MIT license and available on GitHub.
%U https://aclanthology.org/2026.acl-demo.41/
%P 418-427
Markdown (Informal)
[TokCollate: A Comprehensive Tool for Tokenizer Evaluation and Visualization across Languages](https://aclanthology.org/2026.acl-demo.41/) (Variš et al., ACL 2026)
ACL