@inproceedings{steingrimsson-2023-sentence,
title = "A Sentence Alignment Approach to Document Alignment and Multi-faceted Filtering for Curating Parallel Sentence Pairs from Web-crawled Data",
author = "Steingrimsson, Steinthor",
editor = "Koehn, Philipp and
Haddow, Barry and
Kocmi, Tom and
Monz, Christof",
booktitle = "Proceedings of the Eighth Conference on Machine Translation",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.wmt-1.38",
doi = "10.18653/v1/2023.wmt-1.38",
pages = "366--374",
abstract = "This paper describes the AST submission to the WMT23 Shared Task on Parallel Data Curation. We experiment with two approaches for curating data from the provided web-scraped texts. We use sentence alignment to identify document alignments in the data and extract parallel sentence pairs from the aligned documents. All other sentences, not aligned in that step, are paired based on cosine similarity before we apply various different filters. For filtering, we use language detection, fluency classification, word alignments, cosine distance as calculated by multilingual sentence embedding models, and Bicleaner AI. Our best model outperforms the baseline by 1.9 BLEU points on average over the four provided evaluation sets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="steingrimsson-2023-sentence">
<titleInfo>
<title>A Sentence Alignment Approach to Document Alignment and Multi-faceted Filtering for Curating Parallel Sentence Pairs from Web-crawled Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Steinthor</namePart>
<namePart type="family">Steingrimsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes the AST submission to the WMT23 Shared Task on Parallel Data Curation. We experiment with two approaches for curating data from the provided web-scraped texts. We use sentence alignment to identify document alignments in the data and extract parallel sentence pairs from the aligned documents. All other sentences, not aligned in that step, are paired based on cosine similarity before we apply various different filters. For filtering, we use language detection, fluency classification, word alignments, cosine distance as calculated by multilingual sentence embedding models, and Bicleaner AI. Our best model outperforms the baseline by 1.9 BLEU points on average over the four provided evaluation sets.</abstract>
<identifier type="citekey">steingrimsson-2023-sentence</identifier>
<identifier type="doi">10.18653/v1/2023.wmt-1.38</identifier>
<location>
<url>https://aclanthology.org/2023.wmt-1.38</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>366</start>
<end>374</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Sentence Alignment Approach to Document Alignment and Multi-faceted Filtering for Curating Parallel Sentence Pairs from Web-crawled Data
%A Steingrimsson, Steinthor
%Y Koehn, Philipp
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Monz, Christof
%S Proceedings of the Eighth Conference on Machine Translation
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F steingrimsson-2023-sentence
%X This paper describes the AST submission to the WMT23 Shared Task on Parallel Data Curation. We experiment with two approaches for curating data from the provided web-scraped texts. We use sentence alignment to identify document alignments in the data and extract parallel sentence pairs from the aligned documents. All other sentences, not aligned in that step, are paired based on cosine similarity before we apply various different filters. For filtering, we use language detection, fluency classification, word alignments, cosine distance as calculated by multilingual sentence embedding models, and Bicleaner AI. Our best model outperforms the baseline by 1.9 BLEU points on average over the four provided evaluation sets.
%R 10.18653/v1/2023.wmt-1.38
%U https://aclanthology.org/2023.wmt-1.38
%U https://doi.org/10.18653/v1/2023.wmt-1.38
%P 366-374
Markdown (Informal)
[A Sentence Alignment Approach to Document Alignment and Multi-faceted Filtering for Curating Parallel Sentence Pairs from Web-crawled Data](https://aclanthology.org/2023.wmt-1.38) (Steingrimsson, WMT 2023)
ACL