@inproceedings{sant-etal-2024-training,
    title = "Training and Fine-Tuning {NMT} Models for Low-Resource Languages Using Apertium-Based Synthetic Corpora",
    author = "Sant, Aleix and
      Bardanca, Daniel and
      Pichel Campos, Jos{\'e} Ramom and
      De Luca Fornaciari, Francesca and
      Escolano, Carlos and
      Garcia Gilabert, Javier and
      Gamallo, Pablo and
      Mash, Audrey and
      Liao, Xixian and
      Melero, Maite",
    editor = "Haddow, Barry and
      Kocmi, Tom and
      Koehn, Philipp and
      Monz, Christof",
    booktitle = "Proceedings of the Ninth Conference on Machine Translation",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.wmt-1.90",
    pages = "925--933",
    abstract = "In this paper, we present the two strategies employed for the WMT24 Shared Task on Translation into Low-Resource Languages of Spain. We participated in the language pairs of Spanish-to-Aragonese, Spanish-to-Aranese, and Spanish-to-Asturian, developing neural-based translation systems and moving away from rule-based approaches for these language directions. To create these models, two distinct strategies were employed. The first strategy involved a thorough cleaning process and curation of the limited provided data, followed by fine-tuning the multilingual NLLB-200-600M model (Constrained Submission). The other strategy involved training a transformer from scratch using a vast amount of synthetic data (Open Submission). Both approaches relied on generated synthetic data and resulted in high ChrF and BLEU scores. However, given the characteristics of the task, the strategy used in the Constrained Submission resulted in higher scores that surpassed the baselines across the three translation directions, whereas the strategy employed in the Open Submission yielded slightly lower scores than the highest baseline.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sant-etal-2024-training">
    <titleInfo>
        <title>Training and Fine-Tuning NMT Models for Low-Resource Languages Using Apertium-Based Synthetic Corpora</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Aleix</namePart>
        <namePart type="family">Sant</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Bardanca</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">José</namePart>
        <namePart type="given">Ramom</namePart>
        <namePart type="family">Pichel Campos</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Francesca</namePart>
        <namePart type="family">De Luca Fornaciari</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Carlos</namePart>
        <namePart type="family">Escolano</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Javier</namePart>
        <namePart type="family">Garcia Gilabert</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Pablo</namePart>
        <namePart type="family">Gamallo</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Audrey</namePart>
        <namePart type="family">Mash</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Xixian</namePart>
        <namePart type="family">Liao</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Maite</namePart>
        <namePart type="family">Melero</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the Ninth Conference on Machine Translation</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Barry</namePart>
            <namePart type="family">Haddow</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Tom</namePart>
            <namePart type="family">Kocmi</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Philipp</namePart>
            <namePart type="family">Koehn</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Christof</namePart>
            <namePart type="family">Monz</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Miami, Florida, USA</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper, we present the two strategies employed for the WMT24 Shared Task on Translation into Low-Resource Languages of Spain. We participated in the language pairs of Spanish-to-Aragonese, Spanish-to-Aranese, and Spanish-to-Asturian, developing neural-based translation systems and moving away from rule-based approaches for these language directions. To create these models, two distinct strategies were employed. The first strategy involved a thorough cleaning process and curation of the limited provided data, followed by fine-tuning the multilingual NLLB-200-600M model (Constrained Submission). The other strategy involved training a transformer from scratch using a vast amount of synthetic data (Open Submission). Both approaches relied on generated synthetic data and resulted in high ChrF and BLEU scores. However, given the characteristics of the task, the strategy used in the Constrained Submission resulted in higher scores that surpassed the baselines across the three translation directions, whereas the strategy employed in the Open Submission yielded slightly lower scores than the highest baseline.</abstract>
    <identifier type="citekey">sant-etal-2024-training</identifier>
    <location>
        <url>https://aclanthology.org/2024.wmt-1.90</url>
    </location>
    <part>
        <date>2024-11</date>
        <extent unit="page">
            <start>925</start>
            <end>933</end>
        </extent>
    </part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T Training and Fine-Tuning NMT Models for Low-Resource Languages Using Apertium-Based Synthetic Corpora
%A Sant, Aleix
%A Bardanca, Daniel
%A Pichel Campos, José Ramom
%A De Luca Fornaciari, Francesca
%A Escolano, Carlos
%A Garcia Gilabert, Javier
%A Gamallo, Pablo
%A Mash, Audrey
%A Liao, Xixian
%A Melero, Maite
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F sant-etal-2024-training
%X In this paper, we present the two strategies employed for the WMT24 Shared Task on Translation into Low-Resource Languages of Spain. We participated in the language pairs of Spanish-to-Aragonese, Spanish-to-Aranese, and Spanish-to-Asturian, developing neural-based translation systems and moving away from rule-based approaches for these language directions. To create these models, two distinct strategies were employed. The first strategy involved a thorough cleaning process and curation of the limited provided data, followed by fine-tuning the multilingual NLLB-200-600M model (Constrained Submission). The other strategy involved training a transformer from scratch using a vast amount of synthetic data (Open Submission). Both approaches relied on generated synthetic data and resulted in high ChrF and BLEU scores. However, given the characteristics of the task, the strategy used in the Constrained Submission resulted in higher scores that surpassed the baselines across the three translation directions, whereas the strategy employed in the Open Submission yielded slightly lower scores than the highest baseline.
%U https://aclanthology.org/2024.wmt-1.90
%P 925-933

Markdown (Informal)
[Training and Fine-Tuning NMT Models for Low-Resource Languages Using Apertium-Based Synthetic Corpora](https://aclanthology.org/2024.wmt-1.90) (Sant et al., WMT 2024)

ACL
Aleix Sant, Daniel Bardanca, José Ramom Pichel Campos, Francesca De Luca Fornaciari, Carlos Escolano, Javier Garcia Gilabert, Pablo Gamallo, Audrey Mash, Xixian Liao, and Maite Melero. 2024. Training and Fine-Tuning NMT Models for Low-Resource Languages Using Apertium-Based Synthetic Corpora. In Proceedings of the Ninth Conference on Machine Translation, pages 925–933, Miami, Florida, USA. Association for Computational Linguistics.