% BibTeX record for ACL Anthology paper 2025.sigtyp-1.4 (paper in workshop proceedings).
% The author "van Dam" is entered in "von Last, First" form so the particle stays
% with the surname for sorting and abbreviation.
@inproceedings{rubehn-etal-2025-annotating,
  title     = {Annotating and Inferring Compositional Structures in Numeral Systems Across Languages},
  author    = {Rubehn, Arne and
               Rzymski, Christoph and
               Ciucci, Luca and
               Bocklage, Katja and
               Ku{\v{c}}erov{\'a}, Al{\v{z}}b{\v{e}}ta and
               Snee, David and
               Stephen, Abishek and
               van Dam, Kellen Parker and
               List, Johann-Mattis},
  editor    = {Hahn, Michael and
               Rani, Priya and
               Kumar, Ritesh and
               Shcherbakov, Andreas and
               Sorokin, Alexey and
               Serikov, Oleg and
               Cotterell, Ryan and
               Vylomova, Ekaterina},
  booktitle = {Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual {NLP}},
  month     = aug,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.sigtyp-1.4/},
  doi       = {10.18653/v1/2025.sigtyp-1.4},
  pages     = {29--42},
  isbn      = {979-8-89176-281-7},
  abstract  = {Numeral systems across the world{'}s languages vary in fascinating ways, both regarding their synchronic structure and the diachronic processes that determined how they evolved in their current shape. For a proper comparison of numeral systems across different languages, however, it is important to code them in a standardized form that allows for the comparison of basic properties. Here, we present a simple but effective coding scheme for numeral annotation, along with a workflow that helps to code numeral systems in a computer-assisted manner, providing sample data for numerals from 1 to 40 in 25 typologically diverse languages. We perform a thorough analysis of the sample, focusing on the systematic comparison between the underlying and the surface morphological structure. We further experiment with automated models for morpheme segmentation, where we find allomorphy as the major reason for segmentation errors. Finally, we show that subword tokenization algorithms are not viable for discovering morphemes in low-resource scenarios.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey rubehn-etal-2025-annotating: authors, host
     proceedings (editors, publisher, place, ISBN), abstract, identifiers,
     URL and page extent. The surname particle of "van Dam" is kept in the
     family namePart rather than listed as a given name. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="rubehn-etal-2025-annotating">
    <titleInfo>
      <title>Annotating and Inferring Compositional Structures in Numeral Systems Across Languages</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Arne</namePart>
      <namePart type="family">Rubehn</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christoph</namePart>
      <namePart type="family">Rzymski</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Luca</namePart>
      <namePart type="family">Ciucci</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Katja</namePart>
      <namePart type="family">Bocklage</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alžběta</namePart>
      <namePart type="family">Kučerová</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="family">Snee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Abishek</namePart>
      <namePart type="family">Stephen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kellen</namePart>
      <namePart type="given">Parker</namePart>
      <namePart type="family">van Dam</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Johann-Mattis</namePart>
      <namePart type="family">List</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume and its editors -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Michael</namePart>
        <namePart type="family">Hahn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Priya</namePart>
        <namePart type="family">Rani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ritesh</namePart>
        <namePart type="family">Kumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Shcherbakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alexey</namePart>
        <namePart type="family">Sorokin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Oleg</namePart>
        <namePart type="family">Serikov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ryan</namePart>
        <namePart type="family">Cotterell</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Vylomova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-281-7</identifier>
    </relatedItem>
    <abstract>Numeral systems across the world’s languages vary in fascinating ways, both regarding their synchronic structure and the diachronic processes that determined how they evolved in their current shape. For a proper comparison of numeral systems across different languages, however, it is important to code them in a standardized form that allows for the comparison of basic properties. Here, we present a simple but effective coding scheme for numeral annotation, along with a workflow that helps to code numeral systems in a computer-assisted manner, providing sample data for numerals from 1 to 40 in 25 typologically diverse languages. We perform a thorough analysis of the sample, focusing on the systematic comparison between the underlying and the surface morphological structure. We further experiment with automated models for morpheme segmentation, where we find allomorphy as the major reason for segmentation errors. Finally, we show that subword tokenization algorithms are not viable for discovering morphemes in low-resource scenarios.</abstract>
    <identifier type="citekey">rubehn-etal-2025-annotating</identifier>
    <identifier type="doi">10.18653/v1/2025.sigtyp-1.4</identifier>
    <location>
      <url>https://aclanthology.org/2025.sigtyp-1.4/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2025-08</date>
      <extent unit="page">
        <start>29</start>
        <end>42</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Annotating and Inferring Compositional Structures in Numeral Systems Across Languages
%A Rubehn, Arne
%A Rzymski, Christoph
%A Ciucci, Luca
%A Bocklage, Katja
%A Kučerová, Alžběta
%A Snee, David
%A Stephen, Abishek
%A van Dam, Kellen Parker
%A List, Johann-Mattis
%Y Hahn, Michael
%Y Rani, Priya
%Y Kumar, Ritesh
%Y Shcherbakov, Andreas
%Y Sorokin, Alexey
%Y Serikov, Oleg
%Y Cotterell, Ryan
%Y Vylomova, Ekaterina
%S Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-281-7
%F rubehn-etal-2025-annotating
%X Numeral systems across the world’s languages vary in fascinating ways, both regarding their synchronic structure and the diachronic processes that determined how they evolved in their current shape. For a proper comparison of numeral systems across different languages, however, it is important to code them in a standardized form that allows for the comparison of basic properties. Here, we present a simple but effective coding scheme for numeral annotation, along with a workflow that helps to code numeral systems in a computer-assisted manner, providing sample data for numerals from 1 to 40 in 25 typologically diverse languages. We perform a thorough analysis of the sample, focusing on the systematic comparison between the underlying and the surface morphological structure. We further experiment with automated models for morpheme segmentation, where we find allomorphy as the major reason for segmentation errors. Finally, we show that subword tokenization algorithms are not viable for discovering morphemes in low-resource scenarios.
%R 10.18653/v1/2025.sigtyp-1.4
%U https://aclanthology.org/2025.sigtyp-1.4/
%U https://doi.org/10.18653/v1/2025.sigtyp-1.4
%P 29-42
Markdown (Informal)
[Annotating and Inferring Compositional Structures in Numeral Systems Across Languages](https://aclanthology.org/2025.sigtyp-1.4/) (Rubehn et al., SIGTYP 2025)
ACL
- Arne Rubehn, Christoph Rzymski, Luca Ciucci, Katja Bocklage, Alžběta Kučerová, David Snee, Abishek Stephen, Kellen Parker van Dam, and Johann-Mattis List. 2025. Annotating and Inferring Compositional Structures in Numeral Systems Across Languages. In Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP, pages 29–42, Vienna, Austria. Association for Computational Linguistics.