@inproceedings{alnajjar-etal-2023-bootstrapping,
title = "Bootstrapping {M}oksha-{E}rzya Neural Machine Translation from Rule-Based Apertium",
author = {Alnajjar, Khalid and
H{\"a}m{\"a}l{\"a}inen, Mika and
Rueter, Jack},
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
{\"O}hman, Emily and
Pirinen, Flammie and
Alnajjar, Khalid and
Miyagawa, So and
Bizzoni, Yuri and
Partanen, Niko and
Rueter, Jack},
booktitle = "Proceedings of the Joint 3rd International Conference on Natural Language Processing for Digital Humanities and 8th International Workshop on Computational Linguistics for Uralic Languages",
month = dec,
year = "2023",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.nlp4dh-1.24/",
pages = "213--218",
abstract = "Neural Machine Translation (NMT) has made significant strides in breaking down language barriers around the globe. For lesser-resourced languages like Moksha and Erzya, however, the development of robust NMT systems remains a challenge due to the scarcity of parallel corpora. This paper presents a novel approach to address this challenge by leveraging the existing rule-based machine translation system Apertium as a tool for synthetic data generation. We fine-tune NLLB-200 for Moksha-Erzya translation and obtain a BLEU of 0.73 on the Apertium generated data. On real world data, we got an improvement of 0.058 BLEU score over Apertium."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alnajjar-etal-2023-bootstrapping">
<titleInfo>
<title>Bootstrapping Moksha-Erzya Neural Machine Translation from Rule-Based Apertium</title>
</titleInfo>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">Rueter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Joint 3rd International Conference on Natural Language Processing for Digital Humanities and 8th International Workshop on Computational Linguistics for Uralic Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Öhman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flammie</namePart>
<namePart type="family">Pirinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">So</namePart>
<namePart type="family">Miyagawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niko</namePart>
<namePart type="family">Partanen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">Rueter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Tokyo, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Neural Machine Translation (NMT) has made significant strides in breaking down language barriers around the globe. For lesser-resourced languages like Moksha and Erzya, however, the development of robust NMT systems remains a challenge due to the scarcity of parallel corpora. This paper presents a novel approach to address this challenge by leveraging the existing rule-based machine translation system Apertium as a tool for synthetic data generation. We fine-tune NLLB-200 for Moksha-Erzya translation and obtain a BLEU of 0.73 on the Apertium generated data. On real world data, we got an improvement of 0.058 BLEU score over Apertium.</abstract>
<identifier type="citekey">alnajjar-etal-2023-bootstrapping</identifier>
<location>
<url>https://aclanthology.org/2023.nlp4dh-1.24/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>213</start>
<end>218</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bootstrapping Moksha-Erzya Neural Machine Translation from Rule-Based Apertium
%A Alnajjar, Khalid
%A Hämäläinen, Mika
%A Rueter, Jack
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Pirinen, Flammie
%Y Alnajjar, Khalid
%Y Miyagawa, So
%Y Bizzoni, Yuri
%Y Partanen, Niko
%Y Rueter, Jack
%S Proceedings of the Joint 3rd International Conference on Natural Language Processing for Digital Humanities and 8th International Workshop on Computational Linguistics for Uralic Languages
%D 2023
%8 December
%I Association for Computational Linguistics
%C Tokyo, Japan
%F alnajjar-etal-2023-bootstrapping
%X Neural Machine Translation (NMT) has made significant strides in breaking down language barriers around the globe. For lesser-resourced languages like Moksha and Erzya, however, the development of robust NMT systems remains a challenge due to the scarcity of parallel corpora. This paper presents a novel approach to address this challenge by leveraging the existing rule-based machine translation system Apertium as a tool for synthetic data generation. We fine-tune NLLB-200 for Moksha-Erzya translation and obtain a BLEU of 0.73 on the Apertium generated data. On real world data, we got an improvement of 0.058 BLEU score over Apertium.
%U https://aclanthology.org/2023.nlp4dh-1.24/
%P 213-218
Markdown (Informal)
[Bootstrapping Moksha-Erzya Neural Machine Translation from Rule-Based Apertium](https://aclanthology.org/2023.nlp4dh-1.24/) (Alnajjar et al., NLP4DH-IWCLUL 2023)
ACL
- Khalid Alnajjar, Mika Hämäläinen, and Jack Rueter. 2023. Bootstrapping Moksha-Erzya Neural Machine Translation from Rule-Based Apertium. In Proceedings of the Joint 3rd International Conference on Natural Language Processing for Digital Humanities and 8th International Workshop on Computational Linguistics for Uralic Languages, pages 213–218, Tokyo, Japan. Association for Computational Linguistics.