@inproceedings{ravikiran-etal-2026-tokenizers,
title = "Do Tokenizers Fail on Informal {H}indi Expressions? Evidence from Static, Downstream, and Robustness Analyses",
author = "Ravikiran, Manikandan and
Tiwari, Tanmay and
Gupta, Vibhu and
Prakash, Rakesh and
Saluja, Rohit and
Mohanty, Shayan",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Plum, Alistair and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({L}o{R}es{LM} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.loreslm-1.2/",
pages = "13--28",
ISBN = "979-8-89176-377-7",
abstract = "We present, to our knowledge, the first systematic evaluation of tokenization quality for \textit{informal Hindi expressions}, combining static, downstream, and robustness analyses. Our investigation centers on three questions: (RQ1) how well tokenizers preserve informal expression units using static boundary and integrity metrics, (RQ2) how tokenization choices affect downstream identification of informal expressions, and (RQ3) how robust tokenizers remain under orthographic variation, romanization, and noisy spelling. Across multilingual, Indic-focused, and byte-level tokenizers, we find that Indic-oriented models (e.g., MuRIL, IndicBERT) preserve expression boundaries better and achieve higher downstream F1 on clean text than generic multilingual models (e.g., mBERT, XLM-R). However, all tokenizers exhibit severe degradation under romanization, with phrase integrity rates approaching zero. These findings demonstrate that tokenization constitutes a hidden but critical bottleneck for informal Hindi NLP, particularly in cross-script settings, and motivate the need for tokenization strategies that explicitly account for phrase-level semantics and orthographic variation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ravikiran-etal-2026-tokenizers">
<titleInfo>
<title>Do Tokenizers Fail on Informal Hindi Expressions? Evidence from Static, Downstream, and Robustness Analyses</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manikandan</namePart>
<namePart type="family">Ravikiran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmay</namePart>
<namePart type="family">Tiwari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vibhu</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rakesh</namePart>
<namePart type="family">Prakash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rohit</namePart>
<namePart type="family">Saluja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shayan</namePart>
<namePart type="family">Mohanty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hansi</namePart>
<namePart type="family">Hettiarachchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alistair</namePart>
<namePart type="family">Plum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Gaber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damith</namePart>
<namePart type="family">Premasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fiona</namePart>
<namePart type="given">Anting</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lasitha</namePart>
<namePart type="family">Uyangodage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-377-7</identifier>
</relatedItem>
<abstract>We present, to our knowledge, the first systematic evaluation of tokenization quality for informal Hindi expressions, combining static, downstream, and robustness analyses. Our investigation centers on three questions: (RQ1) how well tokenizers preserve informal expression units using static boundary and integrity metrics, (RQ2) how tokenization choices affect downstream identification of informal expressions, and (RQ3) how robust tokenizers remain under orthographic variation, romanization, and noisy spelling. Across multilingual, Indic-focused, and byte-level tokenizers, we find that Indic-oriented models (e.g., MuRIL, IndicBERT) preserve expression boundaries better and achieve higher downstream F1 on clean text than generic multilingual models (e.g., mBERT, XLM-R). However, all tokenizers exhibit severe degradation under romanization, with phrase integrity rates approaching zero. These findings demonstrate that tokenization constitutes a hidden but critical bottleneck for informal Hindi NLP, particularly in cross-script settings, and motivate the need for tokenization strategies that explicitly account for phrase-level semantics and orthographic variation.</abstract>
<identifier type="citekey">ravikiran-etal-2026-tokenizers</identifier>
<location>
<url>https://aclanthology.org/2026.loreslm-1.2/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>13</start>
<end>28</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do Tokenizers Fail on Informal Hindi Expressions? Evidence from Static, Downstream, and Robustness Analyses
%A Ravikiran, Manikandan
%A Tiwari, Tanmay
%A Gupta, Vibhu
%A Prakash, Rakesh
%A Saluja, Rohit
%A Mohanty, Shayan
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F ravikiran-etal-2026-tokenizers
%X We present, to our knowledge, the first systematic evaluation of tokenization quality for informal Hindi expressions, combining static, downstream, and robustness analyses. Our investigation centers on three questions: (RQ1) how well tokenizers preserve informal expression units using static boundary and integrity metrics, (RQ2) how tokenization choices affect downstream identification of informal expressions, and (RQ3) how robust tokenizers remain under orthographic variation, romanization, and noisy spelling. Across multilingual, Indic-focused, and byte-level tokenizers, we find that Indic-oriented models (e.g., MuRIL, IndicBERT) preserve expression boundaries better and achieve higher downstream F1 on clean text than generic multilingual models (e.g., mBERT, XLM-R). However, all tokenizers exhibit severe degradation under romanization, with phrase integrity rates approaching zero. These findings demonstrate that tokenization constitutes a hidden but critical bottleneck for informal Hindi NLP, particularly in cross-script settings, and motivate the need for tokenization strategies that explicitly account for phrase-level semantics and orthographic variation.
%U https://aclanthology.org/2026.loreslm-1.2/
%P 13-28
Markdown (Informal)
[Do Tokenizers Fail on Informal Hindi Expressions? Evidence from Static, Downstream, and Robustness Analyses](https://aclanthology.org/2026.loreslm-1.2/) (Ravikiran et al., LoResLM 2026)
ACL
- Manikandan Ravikiran, Tanmay Tiwari, Vibhu Gupta, Rakesh Prakash, Rohit Saluja, and Shayan Mohanty. 2026. Do Tokenizers Fail on Informal Hindi Expressions? Evidence from Static, Downstream, and Robustness Analyses. In Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026), pages 13–28, Rabat, Morocco. Association for Computational Linguistics.