@inproceedings{bane-etal-2022-comparison,
title = "A Comparison of Data Filtering Methods for Neural Machine Translation",
author = "Bane, Fred and
Uguet, Celia Soler and
Stribi{\.z}ew, Wiktor and
Zaretskaya, Anna",
editor = "Campbell, Janice and
Larocca, Stephen and
Marciano, Jay and
Savenkov, Konstantin and
Yanishevsky, Alex",
booktitle = "Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)",
month = sep,
year = "2022",
address = "Orlando, USA",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2022.amta-upg.22",
pages = "313--325",
abstract = "With the increasing availability of large-scale parallel corpora derived from web crawling and bilingual text mining, data filtering is becoming an increasingly important step in neural machine translation (NMT) pipelines. This paper applies several available tools to the task of data filtration, and compares their performance in filtering out different types of noisy data. We also study the effect of filtration with each tool on model performance in the downstream task of NMT by creating a dataset containing a combination of clean and noisy data, filtering the data with each tool, and training NMT engines using the resulting filtered corpora. We evaluate the performance of each engine with a combination of direct assessment (DA) and automated metrics. Our best results are obtained by training for a short time on all available data then filtering the corpus with cross-entropy filtering and training until convergence.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bane-etal-2022-comparison">
<titleInfo>
<title>A Comparison of Data Filtering Methods for Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fred</namePart>
<namePart type="family">Bane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Celia</namePart>
<namePart type="given">Soler</namePart>
<namePart type="family">Uguet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wiktor</namePart>
<namePart type="family">Stribiżew</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Zaretskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Janice</namePart>
<namePart type="family">Campbell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stephen</namePart>
<namePart type="family">Larocca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jay</namePart>
<namePart type="family">Marciano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Konstantin</namePart>
<namePart type="family">Savenkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Yanishevsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Orlando, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the increasing availability of large-scale parallel corpora derived from web crawling and bilingual text mining, data filtering is becoming an increasingly important step in neural machine translation (NMT) pipelines. This paper applies several available tools to the task of data filtration, and compares their performance in filtering out different types of noisy data. We also study the effect of filtration with each tool on model performance in the downstream task of NMT by creating a dataset containing a combination of clean and noisy data, filtering the data with each tool, and training NMT engines using the resulting filtered corpora. We evaluate the performance of each engine with a combination of direct assessment (DA) and automated metrics. Our best results are obtained by training for a short time on all available data then filtering the corpus with cross-entropy filtering and training until convergence.</abstract>
<identifier type="citekey">bane-etal-2022-comparison</identifier>
<location>
<url>https://aclanthology.org/2022.amta-upg.22</url>
</location>
<part>
<date>2022-09</date>
<extent unit="page">
<start>313</start>
<end>325</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Comparison of Data Filtering Methods for Neural Machine Translation
%A Bane, Fred
%A Uguet, Celia Soler
%A Stribiżew, Wiktor
%A Zaretskaya, Anna
%Y Campbell, Janice
%Y Larocca, Stephen
%Y Marciano, Jay
%Y Savenkov, Konstantin
%Y Yanishevsky, Alex
%S Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)
%D 2022
%8 September
%I Association for Machine Translation in the Americas
%C Orlando, USA
%F bane-etal-2022-comparison
%X With the increasing availability of large-scale parallel corpora derived from web crawling and bilingual text mining, data filtering is becoming an increasingly important step in neural machine translation (NMT) pipelines. This paper applies several available tools to the task of data filtration, and compares their performance in filtering out different types of noisy data. We also study the effect of filtration with each tool on model performance in the downstream task of NMT by creating a dataset containing a combination of clean and noisy data, filtering the data with each tool, and training NMT engines using the resulting filtered corpora. We evaluate the performance of each engine with a combination of direct assessment (DA) and automated metrics. Our best results are obtained by training for a short time on all available data then filtering the corpus with cross-entropy filtering and training until convergence.
%U https://aclanthology.org/2022.amta-upg.22
%P 313-325
Markdown (Informal)
[A Comparison of Data Filtering Methods for Neural Machine Translation](https://aclanthology.org/2022.amta-upg.22) (Bane et al., AMTA 2022)
ACL
- Fred Bane, Celia Soler Uguet, Wiktor Stribiżew, and Anna Zaretskaya. 2022. A Comparison of Data Filtering Methods for Neural Machine Translation. In Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track), pages 313–325, Orlando, USA. Association for Machine Translation in the Americas.