@inproceedings{ruiter-etal-2021-integrating,
title = "Integrating Unsupervised Data Generation into Self-Supervised Neural Machine Translation for Low-Resource Languages",
author = "Ruiter, Dana and
Klakow, Dietrich and
van Genabith, Josef and
Espa{\~n}a-Bonet, Cristina",
editor = "Duh, Kevin and
Guzm{\'a}n, Francisco",
booktitle = "Proceedings of Machine Translation Summit XVIII: Research Track",
month = aug,
year = "2021",
address = "Virtual",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2021.mtsummit-research.7",
pages = "76--91",
abstract = "For most language combinations and parallel data is either scarce or simply unavailable. To address this and unsupervised machine translation (UMT) exploits large amounts of monolingual data by using synthetic data generation techniques such as back-translation and noising and while self-supervised NMT (SSNMT) identifies parallel sentences in smaller comparable data and trains on them. To this date and the inclusion of UMT data generation techniques in SSNMT has not been investigated. We show that including UMT techniques into SSNMT significantly outperforms SSNMT (up to +4.3 BLEU and af2en) as well as statistical (+50.8 BLEU) and hybrid UMT (+51.5 BLEU) baselines on related and distantly-related and unrelated language pairs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ruiter-etal-2021-integrating">
<titleInfo>
<title>Integrating Unsupervised Data Generation into Self-Supervised Neural Machine Translation for Low-Resource Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dana</namePart>
<namePart type="family">Ruiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dietrich</namePart>
<namePart type="family">Klakow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josef</namePart>
<namePart type="family">van Genabith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cristina</namePart>
<namePart type="family">España-Bonet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit XVIII: Research Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francisco</namePart>
<namePart type="family">Guzmán</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>For most language combinations, parallel data is either scarce or simply unavailable. To address this, unsupervised machine translation (UMT) exploits large amounts of monolingual data by using synthetic data generation techniques such as back-translation and noising, while self-supervised NMT (SSNMT) identifies parallel sentences in smaller comparable data and trains on them. To this date, the inclusion of UMT data generation techniques in SSNMT has not been investigated. We show that including UMT techniques into SSNMT significantly outperforms SSNMT (up to +4.3 BLEU, af2en) as well as statistical (+50.8 BLEU) and hybrid UMT (+51.5 BLEU) baselines on related, distantly-related and unrelated language pairs.</abstract>
<identifier type="citekey">ruiter-etal-2021-integrating</identifier>
<location>
<url>https://aclanthology.org/2021.mtsummit-research.7</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>76</start>
<end>91</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Integrating Unsupervised Data Generation into Self-Supervised Neural Machine Translation for Low-Resource Languages
%A Ruiter, Dana
%A Klakow, Dietrich
%A van Genabith, Josef
%A España-Bonet, Cristina
%Y Duh, Kevin
%Y Guzmán, Francisco
%S Proceedings of Machine Translation Summit XVIII: Research Track
%D 2021
%8 August
%I Association for Machine Translation in the Americas
%C Virtual
%F ruiter-etal-2021-integrating
%X For most language combinations, parallel data is either scarce or simply unavailable. To address this, unsupervised machine translation (UMT) exploits large amounts of monolingual data by using synthetic data generation techniques such as back-translation and noising, while self-supervised NMT (SSNMT) identifies parallel sentences in smaller comparable data and trains on them. To this date, the inclusion of UMT data generation techniques in SSNMT has not been investigated. We show that including UMT techniques into SSNMT significantly outperforms SSNMT (up to +4.3 BLEU, af2en) as well as statistical (+50.8 BLEU) and hybrid UMT (+51.5 BLEU) baselines on related, distantly-related and unrelated language pairs.
%U https://aclanthology.org/2021.mtsummit-research.7
%P 76-91
Markdown (Informal)
[Integrating Unsupervised Data Generation into Self-Supervised Neural Machine Translation for Low-Resource Languages](https://aclanthology.org/2021.mtsummit-research.7) (Ruiter et al., MTSummit 2021)
ACL
Dana Ruiter, Dietrich Klakow, Josef van Genabith, and Cristina España-Bonet. 2021. Integrating Unsupervised Data Generation into Self-Supervised Neural Machine Translation for Low-Resource Languages. In Proceedings of Machine Translation Summit XVIII: Research Track, pages 76–91, Virtual. Association for Machine Translation in the Americas.