@inproceedings{chelombitko-etal-2025-samponlp,
title = "{S}ampo{NLP}: A Self-Referential Toolkit for Morphological Analysis of Subword Tokenizers",
author = "Chelombitko, Iaroslav and
Chelombitko, Ekaterina and
Komissarov, Aleksey",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
Rie{\ss}ler, Michael and
Morooka, Eiaki V. and
Kharlashkin, Lev},
booktitle = "Proceedings of the 10th International Workshop on Computational Linguistics for Uralic Languages",
month = dec,
year = "2025",
address = "Joensuu, Finland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.iwclul-1.8/",
pages = "57--67",
ISBN = "979-8-89176-360-9",
abstract = "The quality of subword tokenization is critical for Large Language Models, yet evaluating tokenizers for morphologically rich Uralic languages is hampered by the lack of clean morpheme lexicons. We introduce SampoNLP, a corpus-free toolkit for morphological lexicon creation using MDL-inspired Self-Referential Atomicity Scoring, which filters composite forms through internal structural cues - suited for low-resource settings. Using the high-purity lexicons generated by SampoNLP for Finnish, Hungarian, and Estonian, we conduct a systematic evaluation of BPE tokenizers across a range of vocabulary sizes (8k{--}256k). We propose a unified metric, the Integrated Performance Score (IPS), to navigate the trade-off between morpheme coverage and over-splitting. By analyzing the IPS curves, we identify the ``elbow points'' of diminishing returns and provide the first empirically grounded recommendations for optimal vocabulary sizes (k) in these languages. Our study not only offers practical guidance but also quantitatively demonstrates the limitations of standard BPE for highly agglutinative languages. The SampoNLP library and all generated resources are made publicly available."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chelombitko-etal-2025-samponlp">
<titleInfo>
<title>SampoNLP: A Self-Referential Toolkit for Morphological Analysis of Subword Tokenizers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Iaroslav</namePart>
<namePart type="family">Chelombitko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Chelombitko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aleksey</namePart>
<namePart type="family">Komissarov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th International Workshop on Computational Linguistics for Uralic Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Rießler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eiaki</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Morooka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lev</namePart>
<namePart type="family">Kharlashkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Joensuu, Finland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-360-9</identifier>
</relatedItem>
<abstract>The quality of subword tokenization is critical for Large Language Models, yet evaluating tokenizers for morphologically rich Uralic languages is hampered by the lack of clean morpheme lexicons. We introduce SampoNLP, a corpus-free toolkit for morphological lexicon creation using MDL-inspired Self-Referential Atomicity Scoring, which filters composite forms through internal structural cues - suited for low-resource settings. Using the high-purity lexicons generated by SampoNLP for Finnish, Hungarian, and Estonian, we conduct a systematic evaluation of BPE tokenizers across a range of vocabulary sizes (8k–256k). We propose a unified metric, the Integrated Performance Score (IPS), to navigate the trade-off between morpheme coverage and over-splitting. By analyzing the IPS curves, we identify the “elbow points” of diminishing returns and provide the first empirically grounded recommendations for optimal vocabulary sizes (k) in these languages. Our study not only offers practical guidance but also quantitatively demonstrates the limitations of standard BPE for highly agglutinative languages. The SampoNLP library and all generated resources are made publicly available.</abstract>
<identifier type="citekey">chelombitko-etal-2025-samponlp</identifier>
<location>
<url>https://aclanthology.org/2025.iwclul-1.8/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>57</start>
<end>67</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SampoNLP: A Self-Referential Toolkit for Morphological Analysis of Subword Tokenizers
%A Chelombitko, Iaroslav
%A Chelombitko, Ekaterina
%A Komissarov, Aleksey
%Y Hämäläinen, Mika
%Y Rießler, Michael
%Y Morooka, Eiaki V.
%Y Kharlashkin, Lev
%S Proceedings of the 10th International Workshop on Computational Linguistics for Uralic Languages
%D 2025
%8 December
%I Association for Computational Linguistics
%C Joensuu, Finland
%@ 979-8-89176-360-9
%F chelombitko-etal-2025-samponlp
%X The quality of subword tokenization is critical for Large Language Models, yet evaluating tokenizers for morphologically rich Uralic languages is hampered by the lack of clean morpheme lexicons. We introduce SampoNLP, a corpus-free toolkit for morphological lexicon creation using MDL-inspired Self-Referential Atomicity Scoring, which filters composite forms through internal structural cues - suited for low-resource settings. Using the high-purity lexicons generated by SampoNLP for Finnish, Hungarian, and Estonian, we conduct a systematic evaluation of BPE tokenizers across a range of vocabulary sizes (8k–256k). We propose a unified metric, the Integrated Performance Score (IPS), to navigate the trade-off between morpheme coverage and over-splitting. By analyzing the IPS curves, we identify the “elbow points” of diminishing returns and provide the first empirically grounded recommendations for optimal vocabulary sizes (k) in these languages. Our study not only offers practical guidance but also quantitatively demonstrates the limitations of standard BPE for highly agglutinative languages. The SampoNLP library and all generated resources are made publicly available.
%U https://aclanthology.org/2025.iwclul-1.8/
%P 57-67
Markdown (Informal)
[SampoNLP: A Self-Referential Toolkit for Morphological Analysis of Subword Tokenizers](https://aclanthology.org/2025.iwclul-1.8/) (Chelombitko et al., IWCLUL 2025)
ACL