@inproceedings{edman-etal-2020-low,
title = "Low-Resource Unsupervised {NMT}: Diagnosing the Problem and Providing a Linguistically Motivated Solution",
author = "Edman, Lukas and
Toral, Antonio and
van Noord, Gertjan",
editor = "Martins, Andr{\'e} and
Moniz, Helena and
Fumega, Sara and
Martins, Bruno and
Batista, Fernando and
Coheur, Luisa and
Parra, Carla and
Trancoso, Isabel and
Turchi, Marco and
Bisazza, Arianna and
Moorkens, Joss and
Guerberof, Ana and
Nurminen, Mary and
Marg, Lena and
Forcada, Mikel L.",
booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
month = nov,
year = "2020",
address = "Lisboa, Portugal",
publisher = "European Association for Machine Translation",
url = "https://aclanthology.org/2020.eamt-1.10/",
pages = "81--90",
abstract = "Unsupervised Machine Translation has been advancing our ability to translate without parallel data, but state-of-the-art methods assume an abundance of monolingual data. This paper investigates the scenario where monolingual data is limited as well, finding that current unsupervised methods suffer in performance under this stricter setting. We find that the performance loss originates from the poor quality of the pretrained monolingual embeddings, and we offer a potential solution: dependency-based word embeddings. These embeddings result in a complementary word representation which offers a boost in performance of around 1.5 BLEU points compared to standard word2vec when monolingual data is limited to 1 million sentences per language. We also find that the inclusion of sub-word information is crucial to improving the quality of the embeddings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="edman-etal-2020-low">
<titleInfo>
<title>Low-Resource Unsupervised NMT: Diagnosing the Problem and Providing a Linguistically Motivated Solution</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lukas</namePart>
<namePart type="family">Edman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Toral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gertjan</namePart>
<namePart type="family">van Noord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Annual Conference of the European Association for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Moniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Fumega</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bruno</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fernando</namePart>
<namePart type="family">Batista</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luisa</namePart>
<namePart type="family">Coheur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carla</namePart>
<namePart type="family">Parra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabel</namePart>
<namePart type="family">Trancoso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Turchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arianna</namePart>
<namePart type="family">Bisazza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joss</namePart>
<namePart type="family">Moorkens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ana</namePart>
<namePart type="family">Guerberof</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mary</namePart>
<namePart type="family">Nurminen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lena</namePart>
<namePart type="family">Marg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Association for Machine Translation</publisher>
<place>
<placeTerm type="text">Lisboa, Portugal</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Unsupervised Machine Translation has been advancing our ability to translate without parallel data, but state-of-the-art methods assume an abundance of monolingual data. This paper investigates the scenario where monolingual data is limited as well, finding that current unsupervised methods suffer in performance under this stricter setting. We find that the performance loss originates from the poor quality of the pretrained monolingual embeddings, and we offer a potential solution: dependency-based word embeddings. These embeddings result in a complementary word representation which offers a boost in performance of around 1.5 BLEU points compared to standard word2vec when monolingual data is limited to 1 million sentences per language. We also find that the inclusion of sub-word information is crucial to improving the quality of the embeddings.</abstract>
<identifier type="citekey">edman-etal-2020-low</identifier>
<location>
<url>https://aclanthology.org/2020.eamt-1.10/</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>81</start>
<end>90</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Low-Resource Unsupervised NMT: Diagnosing the Problem and Providing a Linguistically Motivated Solution
%A Edman, Lukas
%A Toral, Antonio
%A van Noord, Gertjan
%Y Martins, André
%Y Moniz, Helena
%Y Fumega, Sara
%Y Martins, Bruno
%Y Batista, Fernando
%Y Coheur, Luisa
%Y Parra, Carla
%Y Trancoso, Isabel
%Y Turchi, Marco
%Y Bisazza, Arianna
%Y Moorkens, Joss
%Y Guerberof, Ana
%Y Nurminen, Mary
%Y Marg, Lena
%Y Forcada, Mikel L.
%S Proceedings of the 22nd Annual Conference of the European Association for Machine Translation
%D 2020
%8 November
%I European Association for Machine Translation
%C Lisboa, Portugal
%F edman-etal-2020-low
%X Unsupervised Machine Translation has been advancing our ability to translate without parallel data, but state-of-the-art methods assume an abundance of monolingual data. This paper investigates the scenario where monolingual data is limited as well, finding that current unsupervised methods suffer in performance under this stricter setting. We find that the performance loss originates from the poor quality of the pretrained monolingual embeddings, and we offer a potential solution: dependency-based word embeddings. These embeddings result in a complementary word representation which offers a boost in performance of around 1.5 BLEU points compared to standard word2vec when monolingual data is limited to 1 million sentences per language. We also find that the inclusion of sub-word information is crucial to improving the quality of the embeddings.
%U https://aclanthology.org/2020.eamt-1.10/
%P 81-90
Markdown (Informal)
[Low-Resource Unsupervised NMT: Diagnosing the Problem and Providing a Linguistically Motivated Solution](https://aclanthology.org/2020.eamt-1.10/) (Edman et al., EAMT 2020)
ACL