@inproceedings{dandapat-federmann-2018-iterative,
title = "Iterative Data Augmentation for Neural Machine Translation: a Low Resource Case Study for {E}nglish-{T}elugu",
author = "Dandapat, Sandipan and
Federmann, Christian",
editor = "P{\'e}rez-Ortiz, Juan Antonio and
S{\'a}nchez-Mart{\'\i}nez, Felipe and
Espl{\`a}-Gomis, Miquel and
Popovi{\'c}, Maja and
Rico, Celia and
Martins, Andr{\'e} and
Van den Bogaert, Joachim and
Forcada, Mikel L.",
booktitle = "Proceedings of the 21st Annual Conference of the European Association for Machine Translation",
month = may,
year = "2018",
address = "Alicante, Spain",
url = "https://aclanthology.org/2018.eamt-main.29",
pages = "307--312",
abstract = "Telugu is the fifteenth most commonly spoken language in the world with an estimated reach of 75 million people in the Indian subcontinent. At the same time, it is a severely low resourced language. In this paper, we present work on English{--}Telugu general domain machine translation (MT) systems using small amounts of parallel data. The baseline statistical (SMT) and neural MT (NMT) systems do not yield acceptable translation quality, mostly due to limited resources. However, the use of synthetic parallel data (generated using back translation, based on an NMT engine) significantly improves translation quality and allows NMT to outperform SMT. We extend back translation and propose a new, iterative data augmentation (IDA) method. Filtering of synthetic data and IDA both further boost translation quality of our final NMT systems, as measured by BLEU scores on all test sets and based on state-of-the-art human evaluation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dandapat-federmann-2018-iterative">
<titleInfo>
<title>Iterative Data Augmentation for Neural Machine Translation: a Low Resource Case Study for English-Telugu</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sandipan</namePart>
<namePart type="family">Dandapat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Federmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st Annual Conference of the European Association for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Antonio</namePart>
<namePart type="family">Pérez-Ortiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felipe</namePart>
<namePart type="family">Sánchez-Martínez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Popović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Celia</namePart>
<namePart type="family">Rico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joachim</namePart>
<namePart type="family">Van den Bogaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<place>
<placeTerm type="text">Alicante, Spain</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Telugu is the fifteenth most commonly spoken language in the world with an estimated reach of 75 million people in the Indian subcontinent. At the same time, it is a severely low resourced language. In this paper, we present work on English–Telugu general domain machine translation (MT) systems using small amounts of parallel data. The baseline statistical (SMT) and neural MT (NMT) systems do not yield acceptable translation quality, mostly due to limited resources. However, the use of synthetic parallel data (generated using back translation, based on an NMT engine) significantly improves translation quality and allows NMT to outperform SMT. We extend back translation and propose a new, iterative data augmentation (IDA) method. Filtering of synthetic data and IDA both further boost translation quality of our final NMT systems, as measured by BLEU scores on all test sets and based on state-of-the-art human evaluation.</abstract>
<identifier type="citekey">dandapat-federmann-2018-iterative</identifier>
<location>
<url>https://aclanthology.org/2018.eamt-main.29</url>
</location>
<part>
<date>2018-05</date>
<extent unit="page">
<start>307</start>
<end>312</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Iterative Data Augmentation for Neural Machine Translation: a Low Resource Case Study for English-Telugu
%A Dandapat, Sandipan
%A Federmann, Christian
%Y Pérez-Ortiz, Juan Antonio
%Y Sánchez-Martínez, Felipe
%Y Esplà-Gomis, Miquel
%Y Popović, Maja
%Y Rico, Celia
%Y Martins, André
%Y Van den Bogaert, Joachim
%Y Forcada, Mikel L.
%S Proceedings of the 21st Annual Conference of the European Association for Machine Translation
%D 2018
%8 May
%C Alicante, Spain
%F dandapat-federmann-2018-iterative
%X Telugu is the fifteenth most commonly spoken language in the world with an estimated reach of 75 million people in the Indian subcontinent. At the same time, it is a severely low resourced language. In this paper, we present work on English–Telugu general domain machine translation (MT) systems using small amounts of parallel data. The baseline statistical (SMT) and neural MT (NMT) systems do not yield acceptable translation quality, mostly due to limited resources. However, the use of synthetic parallel data (generated using back translation, based on an NMT engine) significantly improves translation quality and allows NMT to outperform SMT. We extend back translation and propose a new, iterative data augmentation (IDA) method. Filtering of synthetic data and IDA both further boost translation quality of our final NMT systems, as measured by BLEU scores on all test sets and based on state-of-the-art human evaluation.
%U https://aclanthology.org/2018.eamt-main.29
%P 307-312
Markdown (Informal)
[Iterative Data Augmentation for Neural Machine Translation: a Low Resource Case Study for English-Telugu](https://aclanthology.org/2018.eamt-main.29) (Dandapat & Federmann, EAMT 2018)
ACL