@inproceedings{goot-etal-2025-morsed,
title = "{MorSeD}: {Morphological} Segmentation of {Danish} and its Effect on Language Modeling",
author = "Goot, Rob van der and
Jensen, Anette and
Schledermann, Emil Allerslev and
Kildeberg, Mikkel Wildner and
Larsen, Nicolaj and
Zhang, Mike and
Bassignana, Elisa",
editor = "Johansson, Richard and
Stymne, Sara",
booktitle = "Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)",
month = mar,
year = "2025",
address = "Tallinn, Estonia",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2025.nodalida-1.23/",
pages = "223--229",
ISBN = "978-9908-53-109-0",
abstract = "Current language models (LMs) mostly exploit subwords as input units based on statistical co-occurrences of characters. Adjacently, previous work has shown that modeling morphemes can aid performance for Natural Language Processing (NLP) models. However, morphemes are challenging to obtain as there is no annotated data in most languages. In this work, we release a wide-coverage Danish morphological segmentation evaluation set. We evaluate a range of unsupervised token segmenters and evaluate the downstream effect of using morphemes as input units for transformer-based LMs. Our results show that popular subword algorithms perform poorly on this task, scoring at most an F1 of 57.6 compared to 68.0 for an unsupervised morphological segmenter (Morfessor). Furthermore, evaluate a range of segmenters on the task of language modeling."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="goot-etal-2025-morsed">
<titleInfo>
<title>MorSeD: Morphological Segmentation of Danish and its Effect on Language Modeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="given">van</namePart>
<namePart type="given">der</namePart>
<namePart type="family">Goot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anette</namePart>
<namePart type="family">Jensen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emil</namePart>
<namePart type="given">Allerslev</namePart>
<namePart type="family">Schledermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikkel</namePart>
<namePart type="given">Wildner</namePart>
<namePart type="family">Kildeberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolaj</namePart>
<namePart type="family">Larsen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elisa</namePart>
<namePart type="family">Bassignana</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Johansson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Stymne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tallinn, Estonia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-9908-53-109-0</identifier>
</relatedItem>
<abstract>Current language models (LMs) mostly exploit subwords as input units based on statistical co-occurrences of characters. Adjacently, previous work has shown that modeling morphemes can aid performance for Natural Language Processing (NLP) models. However, morphemes are challenging to obtain as there is no annotated data in most languages. In this work, we release a wide-coverage Danish morphological segmentation evaluation set. We evaluate a range of unsupervised token segmenters and evaluate the downstream effect of using morphemes as input units for transformer-based LMs. Our results show that popular subword algorithms perform poorly on this task, scoring at most an F1 of 57.6 compared to 68.0 for an unsupervised morphological segmenter (Morfessor). Furthermore, evaluate a range of segmenters on the task of language modeling.</abstract>
<identifier type="citekey">goot-etal-2025-morsed</identifier>
<location>
<url>https://aclanthology.org/2025.nodalida-1.23/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>223</start>
<end>229</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MorSeD: Morphological Segmentation of Danish and its Effect on Language Modeling
%A Goot, Rob van der
%A Jensen, Anette
%A Schledermann, Emil Allerslev
%A Kildeberg, Mikkel Wildner
%A Larsen, Nicolaj
%A Zhang, Mike
%A Bassignana, Elisa
%Y Johansson, Richard
%Y Stymne, Sara
%S Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-109-0
%F goot-etal-2025-morsed
%X Current language models (LMs) mostly exploit subwords as input units based on statistical co-occurrences of characters. Adjacently, previous work has shown that modeling morphemes can aid performance for Natural Language Processing (NLP) models. However, morphemes are challenging to obtain as there is no annotated data in most languages. In this work, we release a wide-coverage Danish morphological segmentation evaluation set. We evaluate a range of unsupervised token segmenters and evaluate the downstream effect of using morphemes as input units for transformer-based LMs. Our results show that popular subword algorithms perform poorly on this task, scoring at most an F1 of 57.6 compared to 68.0 for an unsupervised morphological segmenter (Morfessor). Furthermore, evaluate a range of segmenters on the task of language modeling.
%U https://aclanthology.org/2025.nodalida-1.23/
%P 223-229
Markdown (Informal)
[MorSeD: Morphological Segmentation of Danish and its Effect on Language Modeling](https://aclanthology.org/2025.nodalida-1.23/) (Goot et al., NoDaLiDa 2025)
ACL
- Rob van der Goot, Anette Jensen, Emil Allerslev Schledermann, Mikkel Wildner Kildeberg, Nicolaj Larsen, Mike Zhang, and Elisa Bassignana. 2025. MorSeD: Morphological Segmentation of Danish and its Effect on Language Modeling. In Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), pages 223–229, Tallinn, Estonia. University of Tartu Library.