@inproceedings{lo-etal-2018-accurate,
title = "Accurate semantic textual similarity for cleaning noisy parallel corpora using semantic machine translation evaluation metric: The {NRC} supervised submissions to the Parallel Corpus Filtering task",
author = "Lo, Chi-kiu and
Simard, Michel and
Stewart, Darlene and
Larkin, Samuel and
Goutte, Cyril and
Littell, Patrick",
editor = "Bojar, Ond{\v{r}}ej and
Chatterjee, Rajen and
Federmann, Christian and
Fishel, Mark and
Graham, Yvette and
Haddow, Barry and
Huck, Matthias and
Yepes, Antonio Jimeno and
Koehn, Philipp and
Monz, Christof and
Negri, Matteo and
N{\'e}v{\'e}ol, Aur{\'e}lie and
Neves, Mariana and
Post, Matt and
Specia, Lucia and
Turchi, Marco and
Verspoor, Karin",
booktitle = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-6481",
doi = "10.18653/v1/W18-6481",
pages = "908--916",
abstract = "We present our semantic textual similarity approach in filtering a noisy web crawled parallel corpus using YiSi{---}a novel semantic machine translation evaluation metric. The systems mainly based on this supervised approach perform well in the WMT18 Parallel Corpus Filtering shared task (4th place in 100-million-word evaluation, 8th place in 10-million-word evaluation, and 6th place overall, out of 48 submissions). In fact, our best performing system{---}NRC-yisi-bicov is one of the only four submissions ranked top 10 in both evaluations. Our submitted systems also include some initial filtering steps for scaling down the size of the test corpus and a final redundancy removal step for better semantic and token coverage of the filtered corpus. In this paper, we also describe our unsuccessful attempt in automatically synthesizing a noisy parallel development corpus for tuning the weights to combine different parallelism and fluency features.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lo-etal-2018-accurate">
<titleInfo>
<title>Accurate semantic textual similarity for cleaning noisy parallel corpora using semantic machine translation evaluation metric: The NRC supervised submissions to the Parallel Corpus Filtering task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chi-kiu</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michel</namePart>
<namePart type="family">Simard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Darlene</namePart>
<namePart type="family">Stewart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Larkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cyril</namePart>
<namePart type="family">Goutte</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Littell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Conference on Machine Translation: Shared Task Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Bojar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajen</namePart>
<namePart type="family">Chatterjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Federmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Fishel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Huck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="given">Jimeno</namePart>
<namePart type="family">Yepes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matteo</namePart>
<namePart type="family">Negri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aurélie</namePart>
<namePart type="family">Névéol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="family">Neves</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matt</namePart>
<namePart type="family">Post</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Turchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karin</namePart>
<namePart type="family">Verspoor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Belgium, Brussels</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present our semantic textual similarity approach in filtering a noisy web crawled parallel corpus using YiSi—a novel semantic machine translation evaluation metric. The systems mainly based on this supervised approach perform well in the WMT18 Parallel Corpus Filtering shared task (4th place in 100-million-word evaluation, 8th place in 10-million-word evaluation, and 6th place overall, out of 48 submissions). In fact, our best performing system—NRC-yisi-bicov is one of the only four submissions ranked top 10 in both evaluations. Our submitted systems also include some initial filtering steps for scaling down the size of the test corpus and a final redundancy removal step for better semantic and token coverage of the filtered corpus. In this paper, we also describe our unsuccessful attempt in automatically synthesizing a noisy parallel development corpus for tuning the weights to combine different parallelism and fluency features.</abstract>
<identifier type="citekey">lo-etal-2018-accurate</identifier>
<identifier type="doi">10.18653/v1/W18-6481</identifier>
<location>
<url>https://aclanthology.org/W18-6481</url>
</location>
<part>
<date>2018-10</date>
<extent unit="page">
<start>908</start>
<end>916</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Accurate semantic textual similarity for cleaning noisy parallel corpora using semantic machine translation evaluation metric: The NRC supervised submissions to the Parallel Corpus Filtering task
%A Lo, Chi-kiu
%A Simard, Michel
%A Stewart, Darlene
%A Larkin, Samuel
%A Goutte, Cyril
%A Littell, Patrick
%Y Bojar, Ondřej
%Y Chatterjee, Rajen
%Y Federmann, Christian
%Y Fishel, Mark
%Y Graham, Yvette
%Y Haddow, Barry
%Y Huck, Matthias
%Y Yepes, Antonio Jimeno
%Y Koehn, Philipp
%Y Monz, Christof
%Y Negri, Matteo
%Y Névéol, Aurélie
%Y Neves, Mariana
%Y Post, Matt
%Y Specia, Lucia
%Y Turchi, Marco
%Y Verspoor, Karin
%S Proceedings of the Third Conference on Machine Translation: Shared Task Papers
%D 2018
%8 October
%I Association for Computational Linguistics
%C Belgium, Brussels
%F lo-etal-2018-accurate
%X We present our semantic textual similarity approach in filtering a noisy web crawled parallel corpus using YiSi—a novel semantic machine translation evaluation metric. The systems mainly based on this supervised approach perform well in the WMT18 Parallel Corpus Filtering shared task (4th place in 100-million-word evaluation, 8th place in 10-million-word evaluation, and 6th place overall, out of 48 submissions). In fact, our best performing system—NRC-yisi-bicov is one of the only four submissions ranked top 10 in both evaluations. Our submitted systems also include some initial filtering steps for scaling down the size of the test corpus and a final redundancy removal step for better semantic and token coverage of the filtered corpus. In this paper, we also describe our unsuccessful attempt in automatically synthesizing a noisy parallel development corpus for tuning the weights to combine different parallelism and fluency features.
%R 10.18653/v1/W18-6481
%U https://aclanthology.org/W18-6481
%U https://doi.org/10.18653/v1/W18-6481
%P 908-916
Markdown (Informal)
[Accurate semantic textual similarity for cleaning noisy parallel corpora using semantic machine translation evaluation metric: The NRC supervised submissions to the Parallel Corpus Filtering task](https://aclanthology.org/W18-6481) (Lo et al., WMT 2018)
ACL