@inproceedings{birch-etal-2021-surprise,
title = "Surprise Language Challenge: Developing a Neural Machine Translation System between {P}ashto and {E}nglish in Two Months",
author = "Birch, Alexandra and
Haddow, Barry and
Valerio Miceli Barone, Antonio and
Helcl, Jindrich and
Waldendorf, Jonas and
S{\'a}nchez Mart{\'\i}nez, Felipe and
Forcada, Mikel and
S{\'a}nchez Cartagena, V{\'\i}ctor and
P{\'e}rez-Ortiz, Juan Antonio and
Espl{\`a}-Gomis, Miquel and
Aziz, Wilker and
Murady, Lina and
Sariisik, Sevi and
van der Kreeft, Peggy and
Macquarrie, Kay",
editor = "Duh, Kevin and
Guzm{\'a}n, Francisco",
booktitle = "Proceedings of Machine Translation Summit XVIII: Research Track",
month = aug,
year = "2021",
address = "Virtual",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2021.mtsummit-research.8",
pages = "92--102",
abstract = "In the media industry and the focus of global reporting can shift overnight. There is a compelling need to be able to develop new machine translation systems in a short period of time and in order to more efficiently cover quickly developing stories. As part of the EU project GoURMET and which focusses on low-resource machine translation and our media partners selected a surprise language for which a machine translation system had to be built and evaluated in two months(February and March 2021). The language selected was Pashto and an Indo-Iranian language spoken in Afghanistan and Pakistan and India. In this period we completed the full pipeline of development of a neural machine translation system: data crawling and cleaning and aligning and creating test sets and developing and testing models and and delivering them to the user partners. In this paperwe describe rapid data creation and experiments with transfer learning and pretraining for this low-resource language pair. We find that starting from an existing large model pre-trained on 50languages leads to far better BLEU scores than pretraining on one high-resource language pair with a smaller model. We also present human evaluation of our systems and which indicates that the resulting systems perform better than a freely available commercial system when translating from English into Pashto direction and and similarly when translating from Pashto into English.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="birch-etal-2021-surprise">
<titleInfo>
<title>Surprise Language Challenge: Developing a Neural Machine Translation System between Pashto and English in Two Months</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Birch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Valerio Miceli Barone</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jindrich</namePart>
<namePart type="family">Helcl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonas</namePart>
<namePart type="family">Waldendorf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felipe</namePart>
<namePart type="family">Sánchez Martínez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Víctor</namePart>
<namePart type="family">Sánchez Cartagena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Antonio</namePart>
<namePart type="family">Pérez-Ortiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wilker</namePart>
<namePart type="family">Aziz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Murady</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sevi</namePart>
<namePart type="family">Sariisik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peggy</namePart>
<namePart type="family">van der Kreeft</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kay</namePart>
<namePart type="family">Macquarrie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit XVIII: Research Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francisco</namePart>
<namePart type="family">Guzmán</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In the media industry and the focus of global reporting can shift overnight. There is a compelling need to be able to develop new machine translation systems in a short period of time and in order to more efficiently cover quickly developing stories. As part of the EU project GoURMET and which focusses on low-resource machine translation and our media partners selected a surprise language for which a machine translation system had to be built and evaluated in two months(February and March 2021). The language selected was Pashto and an Indo-Iranian language spoken in Afghanistan and Pakistan and India. In this period we completed the full pipeline of development of a neural machine translation system: data crawling and cleaning and aligning and creating test sets and developing and testing models and and delivering them to the user partners. In this paperwe describe rapid data creation and experiments with transfer learning and pretraining for this low-resource language pair. We find that starting from an existing large model pre-trained on 50languages leads to far better BLEU scores than pretraining on one high-resource language pair with a smaller model. We also present human evaluation of our systems and which indicates that the resulting systems perform better than a freely available commercial system when translating from English into Pashto direction and and similarly when translating from Pashto into English.</abstract>
<identifier type="citekey">birch-etal-2021-surprise</identifier>
<location>
<url>https://aclanthology.org/2021.mtsummit-research.8</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>92</start>
<end>102</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Surprise Language Challenge: Developing a Neural Machine Translation System between Pashto and English in Two Months
%A Birch, Alexandra
%A Haddow, Barry
%A Valerio Miceli Barone, Antonio
%A Helcl, Jindrich
%A Waldendorf, Jonas
%A Sánchez Martínez, Felipe
%A Forcada, Mikel
%A Sánchez Cartagena, Víctor
%A Pérez-Ortiz, Juan Antonio
%A Esplà-Gomis, Miquel
%A Aziz, Wilker
%A Murady, Lina
%A Sariisik, Sevi
%A van der Kreeft, Peggy
%A Macquarrie, Kay
%Y Duh, Kevin
%Y Guzmán, Francisco
%S Proceedings of Machine Translation Summit XVIII: Research Track
%D 2021
%8 August
%I Association for Machine Translation in the Americas
%C Virtual
%F birch-etal-2021-surprise
%X In the media industry and the focus of global reporting can shift overnight. There is a compelling need to be able to develop new machine translation systems in a short period of time and in order to more efficiently cover quickly developing stories. As part of the EU project GoURMET and which focusses on low-resource machine translation and our media partners selected a surprise language for which a machine translation system had to be built and evaluated in two months(February and March 2021). The language selected was Pashto and an Indo-Iranian language spoken in Afghanistan and Pakistan and India. In this period we completed the full pipeline of development of a neural machine translation system: data crawling and cleaning and aligning and creating test sets and developing and testing models and and delivering them to the user partners. In this paperwe describe rapid data creation and experiments with transfer learning and pretraining for this low-resource language pair. We find that starting from an existing large model pre-trained on 50languages leads to far better BLEU scores than pretraining on one high-resource language pair with a smaller model. We also present human evaluation of our systems and which indicates that the resulting systems perform better than a freely available commercial system when translating from English into Pashto direction and and similarly when translating from Pashto into English.
%U https://aclanthology.org/2021.mtsummit-research.8
%P 92-102
Markdown (Informal)
[Surprise Language Challenge: Developing a Neural Machine Translation System between Pashto and English in Two Months](https://aclanthology.org/2021.mtsummit-research.8) (Birch et al., MTSummit 2021)
ACL
- Alexandra Birch, Barry Haddow, Antonio Valerio Miceli Barone, Jindrich Helcl, Jonas Waldendorf, Felipe Sánchez Martínez, Mikel Forcada, Víctor Sánchez Cartagena, Juan Antonio Pérez-Ortiz, Miquel Esplà-Gomis, Wilker Aziz, Lina Murady, Sevi Sariisik, Peggy van der Kreeft, and Kay Macquarrie. 2021. Surprise Language Challenge: Developing a Neural Machine Translation System between Pashto and English in Two Months. In Proceedings of Machine Translation Summit XVIII: Research Track, pages 92–102, Virtual. Association for Machine Translation in the Americas.