@inproceedings{goot-etal-2025-distals,
title = "{D}ista{L}s: a Comprehensive Collection of Language Distance Measures",
author = "van der Goot, Rob and
Ploeger, Esther and
Blaschke, Verena and
Samardzic, Tanja",
editor = {Habernal, Ivan and
Schulam, Peter and
Tiedemann, J{\"o}rg},
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-demos.23/",
pages = "307--318",
ISBN = "979-8-89176-334-0",
abstract = "Languages vary along a wide variety of dimensions. In Natural Language Processing (NLP), it is useful to know how ``distant'' languages are from each other, so that we can inform NLP models about these differences or predict good transfer languages. Furthermore, it can inform us about how diverse language samples are. However, there are many different perspectives on how distances across languages could be measured, and previous work has predominantly focused on either intuition or a single type of distance, like genealogical or typological distance. Therefore, we propose DistaLs, a toolkit that is designed to provide users with easy access to a wide variety of language distance measures. We also propose a filtered subset, which contains less redundant and more reliable features. DistaLs is designed to be accessible for a variety of use cases, and offers a Python, CLI, and web interface. It is easily updateable, and available as a pip package. Finally, we provide a case-study in which we use DistaLs to measure correlations of distance measures with performance on four different morphosyntactic tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="goot-etal-2025-distals">
<titleInfo>
<title>DistaLs: a Comprehensive Collection of Language Distance Measures</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">van der Goot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Esther</namePart>
<namePart type="family">Ploeger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Verena</namePart>
<namePart type="family">Blaschke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanja</namePart>
<namePart type="family">Samardzic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Habernal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Schulam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-334-0</identifier>
</relatedItem>
<abstract>Languages vary along a wide variety of dimensions. In Natural Language Processing (NLP), it is useful to know how “distant” languages are from each other, so that we can inform NLP models about these differences or predict good transfer languages. Furthermore, it can inform us about how diverse language samples are. However, there are many different perspectives on how distances across languages could be measured, and previous work has predominantly focused on either intuition or a single type of distance, like genealogical or typological distance. Therefore, we propose DistaLs, a toolkit that is designed to provide users with easy access to a wide variety of language distance measures. We also propose a filtered subset, which contains less redundant and more reliable features. DistaLs is designed to be accessible for a variety of use cases, and offers a Python, CLI, and web interface. It is easily updateable, and available as a pip package. Finally, we provide a case-study in which we use DistaLs to measure correlations of distance measures with performance on four different morphosyntactic tasks.</abstract>
<identifier type="citekey">goot-etal-2025-distals</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-demos.23/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>307</start>
<end>318</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DistaLs: a Comprehensive Collection of Language Distance Measures
%A van der Goot, Rob
%A Ploeger, Esther
%A Blaschke, Verena
%A Samardzic, Tanja
%Y Habernal, Ivan
%Y Schulam, Peter
%Y Tiedemann, Jörg
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-334-0
%F goot-etal-2025-distals
%X Languages vary along a wide variety of dimensions. In Natural Language Processing (NLP), it is useful to know how “distant” languages are from each other, so that we can inform NLP models about these differences or predict good transfer languages. Furthermore, it can inform us about how diverse language samples are. However, there are many different perspectives on how distances across languages could be measured, and previous work has predominantly focused on either intuition or a single type of distance, like genealogical or typological distance. Therefore, we propose DistaLs, a toolkit that is designed to provide users with easy access to a wide variety of language distance measures. We also propose a filtered subset, which contains less redundant and more reliable features. DistaLs is designed to be accessible for a variety of use cases, and offers a Python, CLI, and web interface. It is easily updateable, and available as a pip package. Finally, we provide a case-study in which we use DistaLs to measure correlations of distance measures with performance on four different morphosyntactic tasks.
%U https://aclanthology.org/2025.emnlp-demos.23/
%P 307-318
Markdown (Informal)
[DistaLs: a Comprehensive Collection of Language Distance Measures](https://aclanthology.org/2025.emnlp-demos.23/) (van der Goot et al., EMNLP 2025)
ACL
- Rob van der Goot, Esther Ploeger, Verena Blaschke, and Tanja Samardzic. 2025. DistaLs: a Comprehensive Collection of Language Distance Measures. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 307–318, Suzhou, China. Association for Computational Linguistics.