@inproceedings{dandapat-lewis-2018-training,
title = "Training Deployable General Domain {MT} for a Low Resource Language Pair: {E}nglish-{B}angla",
author = "Dandapat, Sandipan and
Lewis, William",
editor = "P{\'e}rez-Ortiz, Juan Antonio and
S{\'a}nchez-Mart{\'i}nez, Felipe and
Espl{\`a}-Gomis, Miquel and
Popovi{\'c}, Maja and
Rico, Celia and
Martins, Andr{\'e} and
Van den Bogaert, Joachim and
Forcada, Mikel L.",
booktitle = "Proceedings of the 21st Annual Conference of the European Association for Machine Translation",
month = may,
year = "2018",
address = "Alicante, Spain",
url = "https://aclanthology.org/2018.eamt-main.11/",
pages = "129--138",
abstract = "A large percentage of the world`s population speaks a language of the Indian subcontinent, what we will call here Indic languages, comprising languages from both Indo-European (e.g., Hindi, Bangla, Gujarati, etc.) and Dravidian (e.g., Tamil, Telugu, Malayalam, etc.) families, upwards of 1.5 Billion people. A universal characteristic of Indic languages is their complex morphology, which, when combined with the general lack of sufficient quantities of high quality parallel data, can make developing machine translation (MT) for these languages difficult. In this paper, we describe our efforts towards developing general domain English{--}Bangla MT systems which are deployable to the Web. We initially developed and deployed SMT-based systems, but over time migrated to NMT-based systems. Our initial SMT-based systems had reasonably good BLEU scores, however, using NMT systems, we have gained significant improvement over SMT baselines. This is achieved using a number of ideas to boost the data store and counter data sparsity: crowd translation of intelligently selected monolingual data (throughput enhanced by an IME (Input Method Editor) designed specifically for QWERTY keyboard entry for Devanagari scripted languages), back-translation, different regularization techniques, dataset augmentation and early stopping."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dandapat-lewis-2018-training">
<titleInfo>
<title>Training Deployable General Domain MT for a Low Resource Language Pair: English-Bangla</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sandipan</namePart>
<namePart type="family">Dandapat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="family">Lewis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st Annual Conference of the European Association for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Antonio</namePart>
<namePart type="family">Pérez-Ortiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felipe</namePart>
<namePart type="family">Sánchez-Martínez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Popović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Celia</namePart>
<namePart type="family">Rico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joachim</namePart>
<namePart type="family">Van den Bogaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<place>
<placeTerm type="text">Alicante, Spain</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A large percentage of the world‘s population speaks a language of the Indian subcontinent, what we will call here Indic languages, comprising languages from both Indo-European (e.g., Hindi, Bangla, Gujarati, etc.) and Dravidian (e.g., Tamil, Telugu, Malayalam, etc.) families, upwards of 1.5 Billion people. A universal characteristic of Indic languages is their complex morphology, which, when combined with the general lack of sufficient quantities of high quality parallel data, can make developing machine translation (MT) for these languages difficult. In this paper, we describe our efforts towards developing general domain English–Bangla MT systems which are deployable to the Web. We initially developed and deployed SMT-based systems, but over time migrated to NMT-based systems. Our initial SMT-based systems had reasonably good BLEU scores, however, using NMT systems, we have gained significant improvement over SMT baselines. This is achieved using a number of ideas to boost the data store and counter data sparsity: crowd translation of intelligently selected monolingual data (throughput enhanced by an IME (Input Method Editor) designed specifically for QWERTY keyboard entry for Devanagari scripted languages), back-translation, different regularization techniques, dataset augmentation and early stopping.</abstract>
<identifier type="citekey">dandapat-lewis-2018-training</identifier>
<location>
<url>https://aclanthology.org/2018.eamt-main.11/</url>
</location>
<part>
<date>2018-05</date>
<extent unit="page">
<start>129</start>
<end>138</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Training Deployable General Domain MT for a Low Resource Language Pair: English-Bangla
%A Dandapat, Sandipan
%A Lewis, William
%Y Pérez-Ortiz, Juan Antonio
%Y Sánchez-Martínez, Felipe
%Y Esplà-Gomis, Miquel
%Y Popović, Maja
%Y Rico, Celia
%Y Martins, André
%Y Van den Bogaert, Joachim
%Y Forcada, Mikel L.
%S Proceedings of the 21st Annual Conference of the European Association for Machine Translation
%D 2018
%8 May
%C Alicante, Spain
%F dandapat-lewis-2018-training
%X A large percentage of the world‘s population speaks a language of the Indian subcontinent, what we will call here Indic languages, comprising languages from both Indo-European (e.g., Hindi, Bangla, Gujarati, etc.) and Dravidian (e.g., Tamil, Telugu, Malayalam, etc.) families, upwards of 1.5 Billion people. A universal characteristic of Indic languages is their complex morphology, which, when combined with the general lack of sufficient quantities of high quality parallel data, can make developing machine translation (MT) for these languages difficult. In this paper, we describe our efforts towards developing general domain English–Bangla MT systems which are deployable to the Web. We initially developed and deployed SMT-based systems, but over time migrated to NMT-based systems. Our initial SMT-based systems had reasonably good BLEU scores, however, using NMT systems, we have gained significant improvement over SMT baselines. This is achieved using a number of ideas to boost the data store and counter data sparsity: crowd translation of intelligently selected monolingual data (throughput enhanced by an IME (Input Method Editor) designed specifically for QWERTY keyboard entry for Devanagari scripted languages), back-translation, different regularization techniques, dataset augmentation and early stopping.
%U https://aclanthology.org/2018.eamt-main.11/
%P 129-138
Markdown (Informal)
[Training Deployable General Domain MT for a Low Resource Language Pair: English-Bangla](https://aclanthology.org/2018.eamt-main.11/) (Dandapat & Lewis, EAMT 2018)
ACL