@inproceedings{aulamo-etal-2020-opusfilter,
title = "{O}pus{F}ilter: A Configurable Parallel Corpus Filtering Toolbox",
author = {Aulamo, Mikko and
Virpioja, Sami and
Tiedemann, J{\"o}rg},
editor = "Celikyilmaz, Asli and
Wen, Tsung-Hsien",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-demos.20",
doi = "10.18653/v1/2020.acl-demos.20",
pages = "150--156",
abstract = "This paper introduces OpusFilter, a flexible and modular toolbox for filtering parallel corpora. It implements a number of components based on heuristic filters, language identification libraries, character-based language models, and word alignment tools, and it can easily be extended with custom filters. Bitext segments can be ranked according to their quality or domain match using single features or a logistic regression model that can be trained without manually labeled training data. We demonstrate the effectiveness of OpusFilter on the example of a Finnish-English news translation task based on noisy web-crawled training data. Applying our tool leads to improved translation quality while significantly reducing the size of the training data, also clearly outperforming an alternative ranking given in the crawled data set. Furthermore, we show the ability of OpusFilter to perform data selection for domain adaptation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="aulamo-etal-2020-opusfilter">
<titleInfo>
<title>OpusFilter: A Configurable Parallel Corpus Filtering Toolbox</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mikko</namePart>
<namePart type="family">Aulamo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sami</namePart>
<namePart type="family">Virpioja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Asli</namePart>
<namePart type="family">Celikyilmaz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsung-Hsien</namePart>
<namePart type="family">Wen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces OpusFilter, a flexible and modular toolbox for filtering parallel corpora. It implements a number of components based on heuristic filters, language identification libraries, character-based language models, and word alignment tools, and it can easily be extended with custom filters. Bitext segments can be ranked according to their quality or domain match using single features or a logistic regression model that can be trained without manually labeled training data. We demonstrate the effectiveness of OpusFilter on the example of a Finnish-English news translation task based on noisy web-crawled training data. Applying our tool leads to improved translation quality while significantly reducing the size of the training data, also clearly outperforming an alternative ranking given in the crawled data set. Furthermore, we show the ability of OpusFilter to perform data selection for domain adaptation.</abstract>
<identifier type="citekey">aulamo-etal-2020-opusfilter</identifier>
<identifier type="doi">10.18653/v1/2020.acl-demos.20</identifier>
<location>
<url>https://aclanthology.org/2020.acl-demos.20</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>150</start>
<end>156</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T OpusFilter: A Configurable Parallel Corpus Filtering Toolbox
%A Aulamo, Mikko
%A Virpioja, Sami
%A Tiedemann, Jörg
%Y Celikyilmaz, Asli
%Y Wen, Tsung-Hsien
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F aulamo-etal-2020-opusfilter
%X This paper introduces OpusFilter, a flexible and modular toolbox for filtering parallel corpora. It implements a number of components based on heuristic filters, language identification libraries, character-based language models, and word alignment tools, and it can easily be extended with custom filters. Bitext segments can be ranked according to their quality or domain match using single features or a logistic regression model that can be trained without manually labeled training data. We demonstrate the effectiveness of OpusFilter on the example of a Finnish-English news translation task based on noisy web-crawled training data. Applying our tool leads to improved translation quality while significantly reducing the size of the training data, also clearly outperforming an alternative ranking given in the crawled data set. Furthermore, we show the ability of OpusFilter to perform data selection for domain adaptation.
%R 10.18653/v1/2020.acl-demos.20
%U https://aclanthology.org/2020.acl-demos.20
%U https://doi.org/10.18653/v1/2020.acl-demos.20
%P 150-156
Markdown (Informal)
[OpusFilter: A Configurable Parallel Corpus Filtering Toolbox](https://aclanthology.org/2020.acl-demos.20) (Aulamo et al., ACL 2020)
ACL