% BibTeX export of ACL Anthology entry 2026.findings-eacl.55 (text outside an entry is ignored by BibTeX).
@inproceedings{nuha-jatowt-2026-towards,
  title     = {Towards the First {NLP} Benchmark for {Ladin} - an Extremely Low-Resource Language},
  author    = {Nuha, Ulin and
               Jatowt, Adam},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-eacl.55/},
  pages     = {1049--1064},
  isbn      = {979-8-89176-386-9},
  abstract  = {The performance of large language models (LLMs) tends to degrade for extremely low-resource languages, primarily due to the lack of labeled training data. Despite growing interest, the availability of high-quality natural language processing (NLP) datasets for these languages remains limited. This paper addresses such gap by focusing on Ladin, an endangered Romance language, specifically the Val Badia variant. Leveraging a small set of parallel Ladin{--}Italian sentence pairs, we create synthetic datasets for sentiment analysis and question answering by translating monolingual Italian data. To ensure linguistic quality, we apply rigorous filtering and back-translation procedures in our method. We further demonstrate that incorporating these synthetic datasets into machine translation training leads to substantial improvements over existing Italian{--}Ladin translation baselines. Our contributions include sentiment analysis and question answering datasets for Ladin, establishing foundational resources that support broader NLP research and downstream applications for underrepresented languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey nuha-jatowt-2026-towards: title, authors, issue date,
       host proceedings (editors, publisher, place, ISBN), abstract, URL and page extent. -->
  <mods ID="nuha-jatowt-2026-towards">
    <titleInfo>
      <title>Towards the First NLP Benchmark for Ladin - an Extremely Low-Resource Language</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Ulin</namePart>
      <namePart type="family">Nuha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adam</namePart>
      <namePart type="family">Jatowt</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume containing this paper -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Vera</namePart>
        <namePart type="family">Demberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kentaro</namePart>
        <namePart type="family">Inui</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lluís</namePart>
        <namePart type="family">Marquez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-386-9</identifier>
    </relatedItem>
    <abstract>The performance of large language models (LLMs) tends to degrade for extremely low-resource languages, primarily due to the lack of labeled training data. Despite growing interest, the availability of high-quality natural language processing (NLP) datasets for these languages remains limited. This paper addresses such gap by focusing on Ladin, an endangered Romance language, specifically the Val Badia variant. Leveraging a small set of parallel Ladin–Italian sentence pairs, we create synthetic datasets for sentiment analysis and question answering by translating monolingual Italian data. To ensure linguistic quality, we apply rigorous filtering and back-translation procedures in our method. We further demonstrate that incorporating these synthetic datasets into machine translation training leads to substantial improvements over existing Italian–Ladin translation baselines. Our contributions include sentiment analysis and question answering datasets for Ladin, establishing foundational resources that support broader NLP research and downstream applications for underrepresented languages.</abstract>
    <identifier type="citekey">nuha-jatowt-2026-towards</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-eacl.55/</url>
    </location>
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>1049</start>
        <end>1064</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Towards the First NLP Benchmark for Ladin - an Extremely Low-Resource Language
%A Nuha, Ulin
%A Jatowt, Adam
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F nuha-jatowt-2026-towards
%X The performance of large language models (LLMs) tends to degrade for extremely low-resource languages, primarily due to the lack of labeled training data. Despite growing interest, the availability of high-quality natural language processing (NLP) datasets for these languages remains limited. This paper addresses such gap by focusing on Ladin, an endangered Romance language, specifically the Val Badia variant. Leveraging a small set of parallel Ladin–Italian sentence pairs, we create synthetic datasets for sentiment analysis and question answering by translating monolingual Italian data. To ensure linguistic quality, we apply rigorous filtering and back-translation procedures in our method. We further demonstrate that incorporating these synthetic datasets into machine translation training leads to substantial improvements over existing Italian–Ladin translation baselines. Our contributions include sentiment analysis and question answering datasets for Ladin, establishing foundational resources that support broader NLP research and downstream applications for underrepresented languages.
%U https://aclanthology.org/2026.findings-eacl.55/
%P 1049-1064
Markdown (Informal)
[Towards the First NLP Benchmark for Ladin - an Extremely Low-Resource Language](https://aclanthology.org/2026.findings-eacl.55/) (Nuha & Jatowt, Findings 2026)
ACL