@inproceedings{schwenk-2018-filtering,
title = "Filtering and Mining Parallel Data in a Joint Multilingual Space",
author = "Schwenk, Holger",
editor = "Gurevych, Iryna and
Miyao, Yusuke",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = "2018",
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P18-2037",
doi = "10.18653/v1/P18-2037",
pages = "228--234",
abstract = "We learn a joint multilingual sentence embedding and use the distance between sentences in different languages to filter noisy parallel data and to mine for parallel data in large news collections. We are able to improve a competitive baseline on the WMT{'}14 English to German task by 0.3 BLEU by filtering out 25{\%} of the training data. The same approach is used to mine additional bitexts for the WMT{'}14 system and to obtain competitive results on the BUCC shared task to identify parallel sentences in comparable corpora. The approach is generic, it can be applied to many language pairs and it is independent of the architecture of the machine translation system.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schwenk-2018-filtering">
<titleInfo>
<title>Filtering and Mining Parallel Data in a Joint Multilingual Space</title>
</titleInfo>
<name type="personal">
<namePart type="given">Holger</namePart>
<namePart type="family">Schwenk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Iryna</namePart>
<namePart type="family">Gurevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Miyao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Melbourne, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We learn a joint multilingual sentence embedding and use the distance between sentences in different languages to filter noisy parallel data and to mine for parallel data in large news collections. We are able to improve a competitive baseline on the WMT’14 English to German task by 0.3 BLEU by filtering out 25% of the training data. The same approach is used to mine additional bitexts for the WMT’14 system and to obtain competitive results on the BUCC shared task to identify parallel sentences in comparable corpora. The approach is generic, it can be applied to many language pairs and it is independent of the architecture of the machine translation system.</abstract>
<identifier type="citekey">schwenk-2018-filtering</identifier>
<identifier type="doi">10.18653/v1/P18-2037</identifier>
<location>
<url>https://aclanthology.org/P18-2037</url>
</location>
<part>
<date>2018-07</date>
<extent unit="page">
<start>228</start>
<end>234</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Filtering and Mining Parallel Data in a Joint Multilingual Space
%A Schwenk, Holger
%Y Gurevych, Iryna
%Y Miyao, Yusuke
%S Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2018
%8 July
%I Association for Computational Linguistics
%C Melbourne, Australia
%F schwenk-2018-filtering
%X We learn a joint multilingual sentence embedding and use the distance between sentences in different languages to filter noisy parallel data and to mine for parallel data in large news collections. We are able to improve a competitive baseline on the WMT’14 English to German task by 0.3 BLEU by filtering out 25% of the training data. The same approach is used to mine additional bitexts for the WMT’14 system and to obtain competitive results on the BUCC shared task to identify parallel sentences in comparable corpora. The approach is generic, it can be applied to many language pairs and it is independent of the architecture of the machine translation system.
%R 10.18653/v1/P18-2037
%U https://aclanthology.org/P18-2037
%U https://doi.org/10.18653/v1/P18-2037
%P 228-234
Markdown (Informal)
[Filtering and Mining Parallel Data in a Joint Multilingual Space](https://aclanthology.org/P18-2037) (Schwenk, ACL 2018)
ACL