@inproceedings{ploeger-etal-2024-typological,
title = "What is {''}Typological Diversity{''} in {NLP}?",
author = "Ploeger, Esther and
Poelman, Wessel and
de Lhoneux, Miryam and
Bjerva, Johannes",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.326",
doi = "10.18653/v1/2024.emnlp-main.326",
pages = "5681--5700",
abstract = "The NLP research community has devoted increased attention to languages beyond English, resulting in considerable improvements for multilingual NLP. However, these improvements only apply to a small subset of the world{'}s languages. An increasing number of papers aspires to enhance generalizable multilingual performance across languages. To this end, linguistic typology is commonly used to motivate language selection, on the basis that a broad typological sample ought to imply generalization across a broad range of languages. These selections are often described as being typologically diverse. In this meta-analysis, we systematically investigate NLP research that includes claims regarding typological diversity. We find there are no set definitions or criteria for such claims. We introduce metrics to approximate the diversity of resulting language samples along several axes and find that the results vary considerably across papers. Crucially, we show that skewed language selection can lead to overestimated multilingual performance. We recommend future work to include an operationalization of typological diversity that empirically justifies the diversity of language samples. To help facilitate this, we release the code for our diversity measures.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ploeger-etal-2024-typological">
<titleInfo>
<title>What is ”Typological Diversity” in NLP?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Esther</namePart>
<namePart type="family">Ploeger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wessel</namePart>
<namePart type="family">Poelman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miryam</namePart>
<namePart type="family">de Lhoneux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johannes</namePart>
<namePart type="family">Bjerva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The NLP research community has devoted increased attention to languages beyond English, resulting in considerable improvements for multilingual NLP. However, these improvements only apply to a small subset of the world’s languages. An increasing number of papers aspires to enhance generalizable multilingual performance across languages. To this end, linguistic typology is commonly used to motivate language selection, on the basis that a broad typological sample ought to imply generalization across a broad range of languages. These selections are often described as being typologically diverse. In this meta-analysis, we systematically investigate NLP research that includes claims regarding typological diversity. We find there are no set definitions or criteria for such claims. We introduce metrics to approximate the diversity of resulting language samples along several axes and find that the results vary considerably across papers. Crucially, we show that skewed language selection can lead to overestimated multilingual performance. We recommend future work to include an operationalization of typological diversity that empirically justifies the diversity of language samples. To help facilitate this, we release the code for our diversity measures.</abstract>
<identifier type="citekey">ploeger-etal-2024-typological</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.326</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.326</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>5681</start>
<end>5700</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T What is ”Typological Diversity” in NLP?
%A Ploeger, Esther
%A Poelman, Wessel
%A de Lhoneux, Miryam
%A Bjerva, Johannes
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F ploeger-etal-2024-typological
%X The NLP research community has devoted increased attention to languages beyond English, resulting in considerable improvements for multilingual NLP. However, these improvements only apply to a small subset of the world’s languages. An increasing number of papers aspires to enhance generalizable multilingual performance across languages. To this end, linguistic typology is commonly used to motivate language selection, on the basis that a broad typological sample ought to imply generalization across a broad range of languages. These selections are often described as being typologically diverse. In this meta-analysis, we systematically investigate NLP research that includes claims regarding typological diversity. We find there are no set definitions or criteria for such claims. We introduce metrics to approximate the diversity of resulting language samples along several axes and find that the results vary considerably across papers. Crucially, we show that skewed language selection can lead to overestimated multilingual performance. We recommend future work to include an operationalization of typological diversity that empirically justifies the diversity of language samples. To help facilitate this, we release the code for our diversity measures.
%R 10.18653/v1/2024.emnlp-main.326
%U https://aclanthology.org/2024.emnlp-main.326
%U https://doi.org/10.18653/v1/2024.emnlp-main.326
%P 5681-5700
Markdown (Informal)
[What is ”Typological Diversity” in NLP?](https://aclanthology.org/2024.emnlp-main.326) (Ploeger et al., EMNLP 2024)
ACL
- Esther Ploeger, Wessel Poelman, Miryam de Lhoneux, and Johannes Bjerva. 2024. What is ”Typological Diversity” in NLP?. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 5681–5700, Miami, Florida, USA. Association for Computational Linguistics.