@inproceedings{briakou-etal-2022-bitextedit,
title = "{B}itext{E}dit: Automatic Bitext Editing for Improved Low-Resource Machine Translation",
author = "Briakou, Eleftheria and
Wang, Sida and
Zettlemoyer, Luke and
Ghazvininejad, Marjan",
editor = "Carpuat, Marine and
de Marneffe, Marie-Catherine and
Meza Ruiz, Ivan Vladimir",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2022",
month = jul,
year = "2022",
address = "Seattle, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-naacl.110",
doi = "10.18653/v1/2022.findings-naacl.110",
pages = "1469--1485",
abstract = "Mined bitexts can contain imperfect translations that yield unreliable training signals for Neural Machine Translation (NMT). While filtering such pairs out is known to improve final model quality, we argue that it is suboptimal in low-resource conditions where even mined data can be limited. In our work, we propose instead, to refine the mined bitexts via automatic editing: given a sentence in a language $x_f$, and a possibly imperfect translation of it $\mathbf{x_e}$, our model generates a revised version $x_f'$ or $x_e'$ that yields a more equivalent translation pair (i.e., {\textless}$x_f, x_e'${\textgreater} or {\textless}$x_f', x_e${\textgreater}). We use a simple editing strategy by (1) mining potentially imperfect translations for each sentence in a given bitext, (2) learning a model to reconstruct the original translations and translate, in a multi-task fashion. Experiments demonstrate that our approach successfully improves the quality of CCMatrix mined bitext for 5 low-resource language-pairs and 10 translation directions by up to 8 BLEU points, in most cases improving upon a competitive translation-based baseline.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="briakou-etal-2022-bitextedit">
<titleInfo>
<title>BitextEdit: Automatic Bitext Editing for Improved Low-Resource Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eleftheria</namePart>
<namePart type="family">Briakou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sida</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Zettlemoyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marjan</namePart>
<namePart type="family">Ghazvininejad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marine</namePart>
<namePart type="family">Carpuat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Catherine</namePart>
<namePart type="family">de Marneffe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="given">Vladimir</namePart>
<namePart type="family">Meza Ruiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Mined bitexts can contain imperfect translations that yield unreliable training signals for Neural Machine Translation (NMT). While filtering such pairs out is known to improve final model quality, we argue that it is suboptimal in low-resource conditions where even mined data can be limited. In our work, we propose instead to refine the mined bitexts via automatic editing: given a sentence in a language x_f and a possibly imperfect translation of it x_e, our model generates a revised version x_f' or x_e' that yields a more equivalent translation pair (i.e., ⟨x_f, x_e'⟩ or ⟨x_f', x_e⟩). We use a simple editing strategy: (1) mining potentially imperfect translations for each sentence in a given bitext, and (2) learning a model to reconstruct the original translations and to translate, in a multi-task fashion. Experiments demonstrate that our approach successfully improves the quality of CCMatrix mined bitext for 5 low-resource language pairs and 10 translation directions by up to 8 BLEU points, in most cases improving upon a competitive translation-based baseline.</abstract>
<identifier type="citekey">briakou-etal-2022-bitextedit</identifier>
<identifier type="doi">10.18653/v1/2022.findings-naacl.110</identifier>
<location>
<url>https://aclanthology.org/2022.findings-naacl.110</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>1469</start>
<end>1485</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BitextEdit: Automatic Bitext Editing for Improved Low-Resource Machine Translation
%A Briakou, Eleftheria
%A Wang, Sida
%A Zettlemoyer, Luke
%A Ghazvininejad, Marjan
%Y Carpuat, Marine
%Y de Marneffe, Marie-Catherine
%Y Meza Ruiz, Ivan Vladimir
%S Findings of the Association for Computational Linguistics: NAACL 2022
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F briakou-etal-2022-bitextedit
%X Mined bitexts can contain imperfect translations that yield unreliable training signals for Neural Machine Translation (NMT). While filtering such pairs out is known to improve final model quality, we argue that it is suboptimal in low-resource conditions where even mined data can be limited. In our work, we propose instead to refine the mined bitexts via automatic editing: given a sentence in a language x_f and a possibly imperfect translation of it x_e, our model generates a revised version x_f' or x_e' that yields a more equivalent translation pair (i.e., ⟨x_f, x_e'⟩ or ⟨x_f', x_e⟩). We use a simple editing strategy: (1) mining potentially imperfect translations for each sentence in a given bitext, and (2) learning a model to reconstruct the original translations and to translate, in a multi-task fashion. Experiments demonstrate that our approach successfully improves the quality of CCMatrix mined bitext for 5 low-resource language pairs and 10 translation directions by up to 8 BLEU points, in most cases improving upon a competitive translation-based baseline.
%R 10.18653/v1/2022.findings-naacl.110
%U https://aclanthology.org/2022.findings-naacl.110
%U https://doi.org/10.18653/v1/2022.findings-naacl.110
%P 1469-1485
Markdown (Informal)
[BitextEdit: Automatic Bitext Editing for Improved Low-Resource Machine Translation](https://aclanthology.org/2022.findings-naacl.110) (Briakou et al., Findings 2022)
ACL
Eleftheria Briakou, Sida Wang, Luke Zettlemoyer, and Marjan Ghazvininejad. 2022. BitextEdit: Automatic Bitext Editing for Improved Low-Resource Machine Translation. In Findings of the Association for Computational Linguistics: NAACL 2022, pages 1469–1485, Seattle, United States. Association for Computational Linguistics.