% ACL Anthology record 2026.latechclfl-1.8 (LaTeCH-CLfL 2026 workshop paper).
% Case-sensitive words in the titles are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitals.
@inproceedings{zeh-etal-2026-globlingdiv-global,
  title     = {{GlobLingDiv}: A global dataset linking linguistic diversity and digital support to reveal landscapes with under-resourced languages for {NLP}},
  author    = {Zeh, Katharina and
               Essfors, Hannes and
               Benson, Juliane and
               T{\"u}ver, Lale and
               Baumann, Andreas and
               Fellner, Hannes A.},
  editor    = {Alves, Diego and
               Bizzoni, Yuri and
               Degaetano-Ortlieb, Stefania and
               Kazantseva, Anna and
               Pagel, Janis and
               Szpakowicz, Stan},
  booktitle = {Proceedings of the 10th Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.latechclfl-1.8/},
  pages     = {80--86},
  isbn      = {979-8-89176-373-9},
  abstract  = {Linguistic diversity is increasingly under pressure globally and is becoming ever more relevant in digital contexts, where many languages remain structurally under-resourced, limiting access to language technologies and inhibiting equitable NLP development. To support linguistic diversity, publicly available data are needed that capture both the number of languages spoken and the distribution of speakers across them. We introduce GlobLingDiv, a database that uses country-level speaker distributions to derive language richness and entropy-based diversity measures, alongside a population-weighted digital language support measure. Applying these metrics globally, we examine the association between linguistic diversity and digital support conditions. The results reveal a substantial imbalance: highly diverse linguistic landscapes show comparatively low digital support, underscoring the need for more inclusive NLP environments.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record; the record ID matches the citekey identifier below. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zeh-etal-2026-globlingdiv-global">
<titleInfo>
<title>GlobLingDiv: A global dataset linking linguistic diversity and digital support to reveal landscapes with under-resourced languages for NLP</title>
</titleInfo>
<!-- Authors of the paper, one <name> element each, all with the role term "author". -->
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">Zeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannes</namePart>
<namePart type="family">Essfors</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juliane</namePart>
<namePart type="family">Benson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lale</namePart>
<namePart type="family">Tüver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Baumann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannes</namePart>
<!-- NOTE(review): the middle initial is stored as "A" with no period, while the other export formats in this file show "Hannes A."; confirm this is intentional. -->
<namePart type="given">A</namePart>
<namePart type="family">Fellner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year and month. -->
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing this paper, with its editors, publisher, place and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Diego</namePart>
<namePart type="family">Alves</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefania</namePart>
<namePart type="family">Degaetano-Ortlieb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kazantseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Janis</namePart>
<namePart type="family">Pagel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stan</namePart>
<namePart type="family">Szpakowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-373-9</identifier>
</relatedItem>
<abstract>Linguistic diversity is increasingly under pressure globally and is becoming ever more relevant in digital contexts, where many languages remain structurally under-resourced, limiting access to language technologies and inhibiting equitable NLP development. To support linguistic diversity, publicly available data are needed that capture both the number of languages spoken and the distribution of speakers across them. We introduce GlobLingDiv, a database that uses country-level speaker distributions to derive language richness and entropy-based diversity measures, alongside a population-weighted digital language support measure. Applying these metrics globally, we examine the association between linguistic diversity and digital support conditions. The results reveal a substantial imbalance: highly diverse linguistic landscapes show comparatively low digital support, underscoring the need for more inclusive NLP environments.</abstract>
<!-- Citation key; same value as the ID attribute on the <mods> element. -->
<identifier type="citekey">zeh-etal-2026-globlingdiv-global</identifier>
<location>
<url>https://aclanthology.org/2026.latechclfl-1.8/</url>
</location>
<!-- Position of the paper within the host volume: date and page range. -->
<part>
<date>2026-03</date>
<extent unit="page">
<start>80</start>
<end>86</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GlobLingDiv: A global dataset linking linguistic diversity and digital support to reveal landscapes with under-resourced languages for NLP
%A Zeh, Katharina
%A Essfors, Hannes
%A Benson, Juliane
%A Tüver, Lale
%A Baumann, Andreas
%A Fellner, Hannes A.
%Y Alves, Diego
%Y Bizzoni, Yuri
%Y Degaetano-Ortlieb, Stefania
%Y Kazantseva, Anna
%Y Pagel, Janis
%Y Szpakowicz, Stan
%S Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-373-9
%F zeh-etal-2026-globlingdiv-global
%X Linguistic diversity is increasingly under pressure globally and is becoming ever more relevant in digital contexts, where many languages remain structurally under-resourced, limiting access to language technologies and inhibiting equitable NLP development. To support linguistic diversity, publicly available data are needed that capture both the number of languages spoken and the distribution of speakers across them. We introduce GlobLingDiv, a database that uses country-level speaker distributions to derive language richness and entropy-based diversity measures, alongside a population-weighted digital language support measure. Applying these metrics globally, we examine the association between linguistic diversity and digital support conditions. The results reveal a substantial imbalance: highly diverse linguistic landscapes show comparatively low digital support, underscoring the need for more inclusive NLP environments.
%U https://aclanthology.org/2026.latechclfl-1.8/
%P 80-86
Markdown (Informal)
[GlobLingDiv: A global dataset linking linguistic diversity and digital support to reveal landscapes with under-resourced languages for NLP](https://aclanthology.org/2026.latechclfl-1.8/) (Zeh et al., LaTeCH-CLfL 2026)
ACL
- Katharina Zeh, Hannes Essfors, Juliane Benson, Lale Tüver, Andreas Baumann, and Hannes A. Fellner. 2026. GlobLingDiv: A global dataset linking linguistic diversity and digital support to reveal landscapes with under-resourced languages for NLP. In Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026, pages 80–86, Rabat, Morocco. Association for Computational Linguistics.