@inproceedings{valer-etal-2024-nesciun,
    title = "Nesciun Lengaz Lasci{\`a} End{\`o}: Machine Translation for {Fassa} {Ladin}",
    author = "Valer, Giovanni and
      Penzo, Nicol{\`o} and
      Staiano, Jacopo",
    editor = "Dell'Orletta, Felice and
      Lenci, Alessandro and
      Montemagni, Simonetta and
      Sprugnoli, Rachele",
    booktitle = "Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)",
    month = dec,
    year = "2024",
    address = "Pisa, Italy",
    publisher = "CEUR Workshop Proceedings",
    url = "https://aclanthology.org/2024.clicit-1.104/",
    pages = "967--975",
    ISBN = "979-12-210-7060-6",
    abstract = "Despite the remarkable success recently obtained by Large Language Models, a significant gap in performance still exists when dealing with low-resource languages which are often poorly supported by off-the-shelf models. In this work we focus on Fassa Ladin, a Rhaeto-Romance linguistic variety spoken by less than ten thousand people in the Dolomitic regions, and set to build the first bidirectional Machine Translation system supporting Italian, English, and Fassa Ladin. To this end, we collected a small though representative corpus compounding 1135 parallel sentences in these three languages, and spanning five domains. We evaluated several models including the open (Meta AI's No Language Left Behind, NLLB-200) and commercial (OpenAI's gpt-4o) state-of-the-art, and indeed found that both obtain unsatisfactory performance. We therefore proceeded to finetune the NLLB-200 model on the data collected, using different approaches. We report a comparative analysis of the results obtained, showing that 1) jointly training for multilingual translation (Ladin-Italian and Ladin-English) significantly improves the performance, and 2) knowledge-transfer is highly effective (e.g., leveraging similarities between Ladin and Friulian), highlighting the importance of targeted data collection and model adaptation in the context of low-resource/endangered languages for which little textual data is available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="valer-etal-2024-nesciun">
<titleInfo>
<title>Nesciun Lengaz Lascià Endò: Machine Translation for Fassa Ladin</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giovanni</namePart>
<namePart type="family">Valer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolò</namePart>
<namePart type="family">Penzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacopo</namePart>
<namePart type="family">Staiano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>Despite the remarkable success recently obtained by Large Language Models, a significant gap in performance still exists when dealing with low-resource languages which are often poorly supported by off-the-shelf models. In this work we focus on Fassa Ladin, a Rhaeto-Romance linguistic variety spoken by less than ten thousand people in the Dolomitic regions, and set to build the first bidirectional Machine Translation system supporting Italian, English, and Fassa Ladin. To this end, we collected a small though representative corpus compounding 1135 parallel sentences in these three languages, and spanning five domains. We evaluated several models including the open (Meta AI’s No Language Left Behind, NLLB-200) and commercial (OpenAI’s gpt-4o) state-of-the-art, and indeed found that both obtain unsatisfactory performance. We therefore proceeded to finetune the NLLB-200 model on the data collected, using different approaches. We report a comparative analysis of the results obtained, showing that 1) jointly training for multilingual translation (Ladin-Italian and Ladin-English) significantly improves the performance, and 2) knowledge-transfer is highly effective (e.g., leveraging similarities between Ladin and Friulian), highlighting the importance of targeted data collection and model adaptation in the context of low-resource/endangered languages for which little textual data is available.</abstract>
<identifier type="citekey">valer-etal-2024-nesciun</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.104/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>967</start>
<end>975</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Nesciun Lengaz Lascià Endò: Machine Translation for Fassa Ladin
%A Valer, Giovanni
%A Penzo, Nicolò
%A Staiano, Jacopo
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F valer-etal-2024-nesciun
%X Despite the remarkable success recently obtained by Large Language Models, a significant gap in performance still exists when dealing with low-resource languages which are often poorly supported by off-the-shelf models. In this work we focus on Fassa Ladin, a Rhaeto-Romance linguistic variety spoken by less than ten thousand people in the Dolomitic regions, and set to build the first bidirectional Machine Translation system supporting Italian, English, and Fassa Ladin. To this end, we collected a small though representative corpus compounding 1135 parallel sentences in these three languages, and spanning five domains. We evaluated several models including the open (Meta AI’s No Language Left Behind, NLLB-200) and commercial (OpenAI’s gpt-4o) state-of-the-art, and indeed found that both obtain unsatisfactory performance. We therefore proceeded to finetune the NLLB-200 model on the data collected, using different approaches. We report a comparative analysis of the results obtained, showing that 1) jointly training for multilingual translation (Ladin-Italian and Ladin-English) significantly improves the performance, and 2) knowledge-transfer is highly effective (e.g., leveraging similarities between Ladin and Friulian), highlighting the importance of targeted data collection and model adaptation in the context of low-resource/endangered languages for which little textual data is available.
%U https://aclanthology.org/2024.clicit-1.104/
%P 967-975
Markdown (Informal)
[Nesciun Lengaz Lascià Endò: Machine Translation for Fassa Ladin](https://aclanthology.org/2024.clicit-1.104/) (Valer et al., CLiC-it 2024)
ACL