% Conference paper (ACL Anthology record 2025.nodalida-1.52).
% Proper nouns and acronyms are brace-protected so styles that recase titles keep them intact.
% NOTE(review): the address field is kept as given in the source record; it looks like the
% conference location rather than the publisher's city -- confirm before changing.
@inproceedings{ploeger-etal-2025-tokenization,
  title     = {Tokenization on Trial: {The} Case of {Kalaallisut}--{Danish} Legal Machine Translation},
  author    = {Ploeger, Esther and
               Saucedo, Paola and
               Bjerva, Johannes and
               Kristensen-McLachlan, Ross Deans and
               Lent, Heather},
  editor    = {Johansson, Richard and
               Stymne, Sara},
  booktitle = {Proceedings of the Joint 25th {Nordic} Conference on Computational Linguistics and 11th {Baltic} Conference on Human Language Technologies ({NoDaLiDa}/{Baltic-HLT} 2025)},
  month     = mar,
  year      = {2025},
  address   = {Tallinn, Estonia},
  publisher = {University of Tartu Library},
  url       = {https://aclanthology.org/2025.nodalida-1.52/},
  pages     = {480--491},
  isbn      = {978-9908-53-109-0},
  abstract  = {The strengths of subword tokenization have been widely demonstrated when applied to higher-resourced, morphologically simple languages. However, it is not self-evident that these results transfer to lower-resourced, morphologically complex languages. In this work, we investigate the influence of different subword segmentation techniques on machine translation between Danish and Kalaallisut, the official language of Greenland. We present the first semi-manually aligned parallel corpus for this language pair, and use it to compare subwords from unsupervised tokenizers and morphological segmenters. We find that Unigram-based segmentation both preserves morphological boundaries and handles out-of-vocabulary words adequately, but that this does not directly correspond to superior translation quality. We hope that our findings lay further groundwork for future efforts in neural machine translation for Kalaallisut.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey ploeger-etal-2025-tokenization. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ploeger-etal-2025-tokenization">

    <!-- Paper title -->
    <titleInfo>
      <title>Tokenization on Trial: The Case of Kalaallisut–Danish Legal Machine Translation</title>
    </titleInfo>

    <!-- Authors -->
    <name type="personal">
      <namePart type="given">Esther</namePart>
      <namePart type="family">Ploeger</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Paola</namePart>
      <namePart type="family">Saucedo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Johannes</namePart>
      <namePart type="family">Bjerva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ross</namePart>
      <namePart type="given">Deans</namePart>
      <namePart type="family">Kristensen-McLachlan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Heather</namePart>
      <namePart type="family">Lent</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper -->
    <originInfo>
      <dateIssued>2025-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Richard</namePart>
        <namePart type="family">Johansson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sara</namePart>
        <namePart type="family">Stymne</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>University of Tartu Library</publisher>
        <place>
          <placeTerm type="text">Tallinn, Estonia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">978-9908-53-109-0</identifier>
    </relatedItem>

    <abstract>The strengths of subword tokenization have been widely demonstrated when applied to higher-resourced, morphologically simple languages. However, it is not self-evident that these results transfer to lower-resourced, morphologically complex languages. In this work, we investigate the influence of different subword segmentation techniques on machine translation between Danish and Kalaallisut, the official language of Greenland. We present the first semi-manually aligned parallel corpus for this language pair, and use it to compare subwords from unsupervised tokenizers and morphological segmenters. We find that Unigram-based segmentation both preserves morphological boundaries and handles out-of-vocabulary words adequately, but that this does not directly correspond to superior translation quality. We hope that our findings lay further groundwork for future efforts in neural machine translation for Kalaallisut.</abstract>

    <!-- Identifiers and landing page -->
    <identifier type="citekey">ploeger-etal-2025-tokenization</identifier>
    <location>
      <url>https://aclanthology.org/2025.nodalida-1.52/</url>
    </location>

    <!-- Date and page range within the host volume -->
    <part>
      <date>2025-03</date>
      <extent unit="page">
        <start>480</start>
        <end>491</end>
      </extent>
    </part>

  </mods>
</modsCollection>
%0 Conference Proceedings
%T Tokenization on Trial: The Case of Kalaallisut–Danish Legal Machine Translation
%A Ploeger, Esther
%A Saucedo, Paola
%A Bjerva, Johannes
%A Kristensen-McLachlan, Ross Deans
%A Lent, Heather
%Y Johansson, Richard
%Y Stymne, Sara
%S Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-109-0
%F ploeger-etal-2025-tokenization
%X The strengths of subword tokenization have been widely demonstrated when applied to higher-resourced, morphologically simple languages. However, it is not self-evident that these results transfer to lower-resourced, morphologically complex languages. In this work, we investigate the influence of different subword segmentation techniques on machine translation between Danish and Kalaallisut, the official language of Greenland. We present the first semi-manually aligned parallel corpus for this language pair, and use it to compare subwords from unsupervised tokenizers and morphological segmenters. We find that Unigram-based segmentation both preserves morphological boundaries and handles out-of-vocabulary words adequately, but that this does not directly correspond to superior translation quality. We hope that our findings lay further groundwork for future efforts in neural machine translation for Kalaallisut.
%U https://aclanthology.org/2025.nodalida-1.52/
%P 480-491
Markdown (Informal)
[Tokenization on Trial: The Case of Kalaallisut–Danish Legal Machine Translation](https://aclanthology.org/2025.nodalida-1.52/) (Ploeger et al., NoDaLiDa 2025)
ACL
- Esther Ploeger, Paola Saucedo, Johannes Bjerva, Ross Deans Kristensen-McLachlan, and Heather Lent. 2025. Tokenization on Trial: The Case of Kalaallisut–Danish Legal Machine Translation. In Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), pages 480–491, Tallinn, Estonia. University of Tartu Library.