@inproceedings{lesci-etal-2025-causal,
title = "Causal Estimation of Tokenisation Bias",
author = "Lesci, Pietro and
Meister, Clara and
Hofmann, Thomas and
Vlachos, Andreas and
Pimentel, Tiago",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1374/",
doi = "10.18653/v1/2025.acl-long.1374",
pages = "28325--28340",
ISBN = "979-8-89176-251-0",
abstract = "Modern language models are typically trained over subword sequences, but ultimately define probabilities over character-strings. Ideally, the choice of the tokeniser{---}which maps character-strings to subwords{---}should not affect the probability assigned to the underlying character-string; in practice, it does. We define this mismatch as **tokenisation bias**. In this work, we quantify one particular type of tokenisation bias: the effect of including or not a subword (e.g., $\langle$ hello $\rangle$) in a tokeniser{'}s vocabulary on the probability a trained model assigns to the corresponding characters (i.e., ``hello''). Estimating this effect is challenging because each model is trained with only one tokeniser. We address this by framing tokenisation bias as a causal effect and estimating it using the regression discontinuity design. Specifically, we exploit the fact that tokenisation algorithms rank subwords and add the first $K$ to a tokeniser{'}s vocabulary, where $K$ is an arbitrary cutoff point. As such, we can estimate a causal effect by comparing similar subwords around this cutoff. Experimentally, we find that tokenisation consistently affects models' outputs across scales, vocabularies, and tokenisers. Notably, a subword{'}s presence in a small model{'}s vocabulary may increase its characters' probability by up to 17 times, highlighting tokenisation as a key design choice in language modelling."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lesci-etal-2025-causal">
<titleInfo>
<title>Causal Estimation of Tokenisation Bias</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pietro</namePart>
<namePart type="family">Lesci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clara</namePart>
<namePart type="family">Meister</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Hofmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Vlachos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiago</namePart>
<namePart type="family">Pimentel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Modern language models are typically trained over subword sequences, but ultimately define probabilities over character-strings. Ideally, the choice of the tokeniser—which maps character-strings to subwords—should not affect the probability assigned to the underlying character-string; in practice, it does. We define this mismatch as **tokenisation bias**. In this work, we quantify one particular type of tokenisation bias: the effect of including or not a subword (e.g., łangle hello ångle) in a tokeniser’s vocabulary on the probability a trained model assigns to the corresponding characters (i.e., “hello”). Estimating this effect is challenging because each model is trained with only one tokeniser. We address this by framing tokenisation bias as a causal effect and estimating it using the regression discontinuity design. Specifically, we exploit the fact that tokenisation algorithms rank subwords and add the first K to a tokeniser’s vocabulary, where K is an arbitrary cutoff point. As such, we can estimate a causal effect by comparing similar subwords around this cutoff. Experimentally, we find that tokenisation consistently affects models’ outputs across scales, vocabularies, and tokenisers. Notably, a subword’s presence in a small model’s vocabulary may increase its characters’ probability by up to 17 times, highlighting tokenisation as a key design choice in language modelling.</abstract>
<identifier type="citekey">lesci-etal-2025-causal</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1374</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1374/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>28325</start>
<end>28340</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Causal Estimation of Tokenisation Bias
%A Lesci, Pietro
%A Meister, Clara
%A Hofmann, Thomas
%A Vlachos, Andreas
%A Pimentel, Tiago
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F lesci-etal-2025-causal
%X Modern language models are typically trained over subword sequences, but ultimately define probabilities over character-strings. Ideally, the choice of the tokeniser—which maps character-strings to subwords—should not affect the probability assigned to the underlying character-string; in practice, it does. We define this mismatch as **tokenisation bias**. In this work, we quantify one particular type of tokenisation bias: the effect of including or not a subword (e.g., łangle hello ångle) in a tokeniser’s vocabulary on the probability a trained model assigns to the corresponding characters (i.e., “hello”). Estimating this effect is challenging because each model is trained with only one tokeniser. We address this by framing tokenisation bias as a causal effect and estimating it using the regression discontinuity design. Specifically, we exploit the fact that tokenisation algorithms rank subwords and add the first K to a tokeniser’s vocabulary, where K is an arbitrary cutoff point. As such, we can estimate a causal effect by comparing similar subwords around this cutoff. Experimentally, we find that tokenisation consistently affects models’ outputs across scales, vocabularies, and tokenisers. Notably, a subword’s presence in a small model’s vocabulary may increase its characters’ probability by up to 17 times, highlighting tokenisation as a key design choice in language modelling.
%R 10.18653/v1/2025.acl-long.1374
%U https://aclanthology.org/2025.acl-long.1374/
%U https://doi.org/10.18653/v1/2025.acl-long.1374
%P 28325-28340
Markdown (Informal)
[Causal Estimation of Tokenisation Bias](https://aclanthology.org/2025.acl-long.1374/) (Lesci et al., ACL 2025)
ACL
- Pietro Lesci, Clara Meister, Thomas Hofmann, Andreas Vlachos, and Tiago Pimentel. 2025. Causal Estimation of Tokenisation Bias. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 28325–28340, Vienna, Austria. Association for Computational Linguistics.