@inproceedings{kambhatla-etal-2022-auxiliary,
title = "Auxiliary Subword Segmentations as Related Languages for Low Resource Multilingual Translation",
author = "Kambhatla, Nishant and
Born, Logan and
Sarkar, Anoop",
editor = {Moniz, Helena and
Macken, Lieve and
Rufener, Andrew and
Barrault, Lo{\"\i}c and
Costa-juss{\`a}, Marta R. and
Declercq, Christophe and
Koponen, Maarit and
Kemp, Ellie and
Pilos, Spyridon and
Forcada, Mikel L. and
Scarton, Carolina and
Van den Bogaert, Joachim and
Daems, Joke and
Tezcan, Arda and
Vanroy, Bram and
Fonteyne, Margot},
booktitle = "Proceedings of the 23rd Annual Conference of the European Association for Machine Translation",
month = jun,
year = "2022",
address = "Ghent, Belgium",
publisher = "European Association for Machine Translation",
url = "https://aclanthology.org/2022.eamt-1.16",
pages = "131--140",
abstract = "We propose a novel technique that combines alternative subword tokenizations of a single source-target language pair that allows us to leverage multilingual neural translation training methods. These alternate segmentations function like related languages in multilingual translation. Overall this improves translation accuracy for low-resource languages and produces translations that are lexically diverse and morphologically rich. We also introduce a cross-teaching technique which yields further improvements in translation accuracy and cross-lingual transfer between high- and low-resource language pairs. Compared to other strong multilingual baselines, our approach yields average gains of +1.7 BLEU across the four low-resource datasets from the multilingual TED-talks dataset. Our technique does not require additional training data and is a drop-in improvement for any existing neural translation system.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kambhatla-etal-2022-auxiliary">
<titleInfo>
<title>Auxiliary Subword Segmentations as Related Languages for Low Resource Multilingual Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nishant</namePart>
<namePart type="family">Kambhatla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Logan</namePart>
<namePart type="family">Born</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Sarkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Annual Conference of the European Association for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Moniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lieve</namePart>
<namePart type="family">Macken</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Rufener</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Loïc</namePart>
<namePart type="family">Barrault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christophe</namePart>
<namePart type="family">Declercq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maarit</namePart>
<namePart type="family">Koponen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ellie</namePart>
<namePart type="family">Kemp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Spyridon</namePart>
<namePart type="family">Pilos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolina</namePart>
<namePart type="family">Scarton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joachim</namePart>
<namePart type="family">Van den Bogaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joke</namePart>
<namePart type="family">Daems</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arda</namePart>
<namePart type="family">Tezcan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bram</namePart>
<namePart type="family">Vanroy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Margot</namePart>
<namePart type="family">Fonteyne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Association for Machine Translation</publisher>
<place>
<placeTerm type="text">Ghent, Belgium</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We propose a novel technique that combines alternative subword tokenizations of a single source-target language pair that allows us to leverage multilingual neural translation training methods. These alternate segmentations function like related languages in multilingual translation. Overall this improves translation accuracy for low-resource languages and produces translations that are lexically diverse and morphologically rich. We also introduce a cross-teaching technique which yields further improvements in translation accuracy and cross-lingual transfer between high- and low-resource language pairs. Compared to other strong multilingual baselines, our approach yields average gains of +1.7 BLEU across the four low-resource datasets from the multilingual TED-talks dataset. Our technique does not require additional training data and is a drop-in improvement for any existing neural translation system.</abstract>
<identifier type="citekey">kambhatla-etal-2022-auxiliary</identifier>
<location>
<url>https://aclanthology.org/2022.eamt-1.16</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>131</start>
<end>140</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Auxiliary Subword Segmentations as Related Languages for Low Resource Multilingual Translation
%A Kambhatla, Nishant
%A Born, Logan
%A Sarkar, Anoop
%Y Moniz, Helena
%Y Macken, Lieve
%Y Rufener, Andrew
%Y Barrault, Loïc
%Y Costa-jussà, Marta R.
%Y Declercq, Christophe
%Y Koponen, Maarit
%Y Kemp, Ellie
%Y Pilos, Spyridon
%Y Forcada, Mikel L.
%Y Scarton, Carolina
%Y Van den Bogaert, Joachim
%Y Daems, Joke
%Y Tezcan, Arda
%Y Vanroy, Bram
%Y Fonteyne, Margot
%S Proceedings of the 23rd Annual Conference of the European Association for Machine Translation
%D 2022
%8 June
%I European Association for Machine Translation
%C Ghent, Belgium
%F kambhatla-etal-2022-auxiliary
%X We propose a novel technique that combines alternative subword tokenizations of a single source-target language pair that allows us to leverage multilingual neural translation training methods. These alternate segmentations function like related languages in multilingual translation. Overall this improves translation accuracy for low-resource languages and produces translations that are lexically diverse and morphologically rich. We also introduce a cross-teaching technique which yields further improvements in translation accuracy and cross-lingual transfer between high- and low-resource language pairs. Compared to other strong multilingual baselines, our approach yields average gains of +1.7 BLEU across the four low-resource datasets from the multilingual TED-talks dataset. Our technique does not require additional training data and is a drop-in improvement for any existing neural translation system.
%U https://aclanthology.org/2022.eamt-1.16
%P 131-140
Markdown (Informal)
[Auxiliary Subword Segmentations as Related Languages for Low Resource Multilingual Translation](https://aclanthology.org/2022.eamt-1.16) (Kambhatla et al., EAMT 2022)
ACL