@inproceedings{bane-zaretskaya-2021-selecting,
title = "Selecting the best data filtering method for {NMT} training",
author = "Bane, Fred and
Zaretskaya, Anna",
editor = "Campbell, Janice and
Huyck, Ben and
Larocca, Stephen and
Marciano, Jay and
Savenkov, Konstantin and
Yanishevsky, Alex",
booktitle = "Proceedings of Machine Translation Summit XVIII: Users and Providers Track",
month = aug,
year = "2021",
address = "Virtual",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2021.mtsummit-up.9",
pages = "89--97",
abstract = "Performance of NMT systems has been proven to depend on the quality of the training data. In this paper we explore different open-source tools that can be used to score the quality of translation pairs, with the goal of obtaining clean corpora for training NMT models. We measure the performance of these tools by correlating their scores with human scores, as well as rank models trained on the resulting filtered datasets in terms of their performance on different test sets and MT performance metrics.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bane-zaretskaya-2021-selecting">
<titleInfo>
<title>Selecting the best data filtering method for NMT training</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fred</namePart>
<namePart type="family">Bane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Zaretskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit XVIII: Users and Providers Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Janice</namePart>
<namePart type="family">Campbell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ben</namePart>
<namePart type="family">Huyck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stephen</namePart>
<namePart type="family">Larocca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jay</namePart>
<namePart type="family">Marciano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Konstantin</namePart>
<namePart type="family">Savenkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Yanishevsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Performance of NMT systems has been proven to depend on the quality of the training data. In this paper we explore different open-source tools that can be used to score the quality of translation pairs, with the goal of obtaining clean corpora for training NMT models. We measure the performance of these tools by correlating their scores with human scores, as well as rank models trained on the resulting filtered datasets in terms of their performance on different test sets and MT performance metrics.</abstract>
<identifier type="citekey">bane-zaretskaya-2021-selecting</identifier>
<location>
<url>https://aclanthology.org/2021.mtsummit-up.9</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>89</start>
<end>97</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Selecting the best data filtering method for NMT training
%A Bane, Fred
%A Zaretskaya, Anna
%Y Campbell, Janice
%Y Huyck, Ben
%Y Larocca, Stephen
%Y Marciano, Jay
%Y Savenkov, Konstantin
%Y Yanishevsky, Alex
%S Proceedings of Machine Translation Summit XVIII: Users and Providers Track
%D 2021
%8 August
%I Association for Machine Translation in the Americas
%C Virtual
%F bane-zaretskaya-2021-selecting
%X Performance of NMT systems has been proven to depend on the quality of the training data. In this paper we explore different open-source tools that can be used to score the quality of translation pairs, with the goal of obtaining clean corpora for training NMT models. We measure the performance of these tools by correlating their scores with human scores, as well as rank models trained on the resulting filtered datasets in terms of their performance on different test sets and MT performance metrics.
%U https://aclanthology.org/2021.mtsummit-up.9
%P 89-97
Markdown (Informal)
[Selecting the best data filtering method for NMT training](https://aclanthology.org/2021.mtsummit-up.9) (Bane & Zaretskaya, MTSummit 2021)
ACL