@inproceedings{staffini-2026-tokenization,
title = "Tokenization Cost, Retention, and Orthography Robustness for {L}adin and {I}talian Varieties",
author = "Staffini, Alessio",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Plum, Alistair and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({L}o{R}es{LM} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.loreslm-1.49/",
pages = "570--583",
ISBN = "979-8-89176-377-7",
abstract = "Tokenizer mismatch is a practical bottleneck for low-resource language varieties: when text is fragmented into disproportionately many subwords or bytes, it wastes context, increases truncation, and can be brittle to orthographic variation. We present a lightweight and reproducible audit centered on Ladin and evaluated on the Identification of Languages and Dialects of Italy benchmark of eleven Italian varieties. Our diagnostic suite combines tokenization cost measures (tokens per word, truncation pressure, bytes per token) with retention indicators (word split rate, continued-token rate, and type-level retention) and fragmentation proxies that reveal splitting patterns beyond fertility. We pair these diagnostics with a conservative orthography robustness protocol (diacritics, casing, punctuation and dash normalization) and assess how diagnostic changes relate to performance drops in lightweight baselines for sentence-level variety identification. We release code and derived statistics to support reproducible tokenizer audits in other low-resource settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="staffini-2026-tokenization">
<titleInfo>
<title>Tokenization Cost, Retention, and Orthography Robustness for Ladin and Italian Varieties</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alessio</namePart>
<namePart type="family">Staffini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hansi</namePart>
<namePart type="family">Hettiarachchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alistair</namePart>
<namePart type="family">Plum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Gaber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damith</namePart>
<namePart type="family">Premasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fiona</namePart>
<namePart type="given">Anting</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lasitha</namePart>
<namePart type="family">Uyangodage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-377-7</identifier>
</relatedItem>
<abstract>Tokenizer mismatch is a practical bottleneck for low-resource language varieties: when text is fragmented into disproportionately many subwords or bytes, it wastes context, increases truncation, and can be brittle to orthographic variation. We present a lightweight and reproducible audit centered on Ladin and evaluated on the Identification of Languages and Dialects of Italy benchmark of eleven Italian varieties. Our diagnostic suite combines tokenization cost measures (tokens per word, truncation pressure, bytes per token) with retention indicators (word split rate, continued-token rate, and type-level retention) and fragmentation proxies that reveal splitting patterns beyond fertility. We pair these diagnostics with a conservative orthography robustness protocol (diacritics, casing, punctuation and dash normalization) and assess how diagnostic changes relate to performance drops in lightweight baselines for sentence-level variety identification. We release code and derived statistics to support reproducible tokenizer audits in other low-resource settings.</abstract>
<identifier type="citekey">staffini-2026-tokenization</identifier>
<location>
<url>https://aclanthology.org/2026.loreslm-1.49/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>570</start>
<end>583</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tokenization Cost, Retention, and Orthography Robustness for Ladin and Italian Varieties
%A Staffini, Alessio
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F staffini-2026-tokenization
%X Tokenizer mismatch is a practical bottleneck for low-resource language varieties: when text is fragmented into disproportionately many subwords or bytes, it wastes context, increases truncation, and can be brittle to orthographic variation. We present a lightweight and reproducible audit centered on Ladin and evaluated on the Identification of Languages and Dialects of Italy benchmark of eleven Italian varieties. Our diagnostic suite combines tokenization cost measures (tokens per word, truncation pressure, bytes per token) with retention indicators (word split rate, continued-token rate, and type-level retention) and fragmentation proxies that reveal splitting patterns beyond fertility. We pair these diagnostics with a conservative orthography robustness protocol (diacritics, casing, punctuation and dash normalization) and assess how diagnostic changes relate to performance drops in lightweight baselines for sentence-level variety identification. We release code and derived statistics to support reproducible tokenizer audits in other low-resource settings.
%U https://aclanthology.org/2026.loreslm-1.49/
%P 570-583
Markdown (Informal)
[Tokenization Cost, Retention, and Orthography Robustness for Ladin and Italian Varieties](https://aclanthology.org/2026.loreslm-1.49/) (Staffini, LoResLM 2026)
ACL