@inproceedings{haberland-etal-2024-italian,
title = "{I}talian-{L}igurian Machine Translation in Its Cultural Context",
author = "Haberland, Christopher R. and
Maillard, Jean and
Lusito, Stefano",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.sigul-1.21",
pages = "168--176",
abstract = "Large multilingual machine translation efforts are driving improved access and performance for under-resourced languages, but often fail to translate culturally specific and local concepts. Additionally, translation from practically relevant input languages may lag behind those that are comparatively over-represented in the training dataset. In this work, we release a new corpus, ZenaMT, containing 7,561 parallel Ligurian-Italian sentences, nearly a fifth of which are also translated into English. This corpus spans five domains: local and international news, Ligurian literature, Genoese Ligurian linguistics concepts, traditional card game rules, and Ligurian geographic expressions. We find that a translation model augmented with ZenaMT improves a baseline by 20{\%}, and by over 25{\%} (BLEU) compared to NLLB-3.3B, which is over 50 times the size. Our results demonstrate the utility of creating data sets for MT that are specifically tailored for the cultural context of Ligurian speakers. We freely release ZenaMT and expect to periodically update the corpus to improve MT performance and domain coverage.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="haberland-etal-2024-italian">
<titleInfo>
<title>Italian-Ligurian Machine Translation in Its Cultural Context</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Haberland</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jean</namePart>
<namePart type="family">Maillard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefano</namePart>
<namePart type="family">Lusito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maite</namePart>
<namePart type="family">Melero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large multilingual machine translation efforts are driving improved access and performance for under-resourced languages, but often fail to translate culturally specific and local concepts. Additionally, translation from practically relevant input languages may lag behind those that are comparatively over-represented in the training dataset. In this work, we release a new corpus, ZenaMT, containing 7,561 parallel Ligurian-Italian sentences, nearly a fifth of which are also translated into English. This corpus spans five domains: local and international news, Ligurian literature, Genoese Ligurian linguistics concepts, traditional card game rules, and Ligurian geographic expressions. We find that a translation model augmented with ZenaMT improves a baseline by 20%, and by over 25% (BLEU) compared to NLLB-3.3B, which is over 50 times the size. Our results demonstrate the utility of creating data sets for MT that are specifically tailored for the cultural context of Ligurian speakers. We freely release ZenaMT and expect to periodically update the corpus to improve MT performance and domain coverage.</abstract>
<identifier type="citekey">haberland-etal-2024-italian</identifier>
<location>
<url>https://aclanthology.org/2024.sigul-1.21</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>168</start>
<end>176</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Italian-Ligurian Machine Translation in Its Cultural Context
%A Haberland, Christopher R.
%A Maillard, Jean
%A Lusito, Stefano
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F haberland-etal-2024-italian
%X Large multilingual machine translation efforts are driving improved access and performance for under-resourced languages, but often fail to translate culturally specific and local concepts. Additionally, translation from practically relevant input languages may lag behind those that are comparatively over-represented in the training dataset. In this work, we release a new corpus, ZenaMT, containing 7,561 parallel Ligurian-Italian sentences, nearly a fifth of which are also translated into English. This corpus spans five domains: local and international news, Ligurian literature, Genoese Ligurian linguistics concepts, traditional card game rules, and Ligurian geographic expressions. We find that a translation model augmented with ZenaMT improves a baseline by 20%, and by over 25% (BLEU) compared to NLLB-3.3B, which is over 50 times the size. Our results demonstrate the utility of creating data sets for MT that are specifically tailored for the cultural context of Ligurian speakers. We freely release ZenaMT and expect to periodically update the corpus to improve MT performance and domain coverage.
%U https://aclanthology.org/2024.sigul-1.21
%P 168-176
Markdown (Informal)
[Italian-Ligurian Machine Translation in Its Cultural Context](https://aclanthology.org/2024.sigul-1.21) (Haberland et al., SIGUL-WS 2024)
ACL