@inproceedings{koehn-2024-neural,
title = "Neural Methods for Aligning Large-Scale Parallel Corpora from the Web for South and {E}ast {A}sian Languages",
author = "Koehn, Philipp",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wmt-1.132",
pages = "1454--1466",
abstract = "We introduce neural methods and a toxicity filtering step to the hierarchical web mining approach of Paracrawl (Ba{\~n}{\'o}n et al., 2020), showing large improvements. We apply these methods to web-scale parallel corpus mining for 9 South and East Asian national languages, creating training resources for machine translation that yield better translation quality for most of these languages than existing publicly available datasets in OPUS. Our methods also generally lead to better results than the global mining approach of Schwenk et al. (2021).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="koehn-2024-neural">
<titleInfo>
<title>Neural Methods for Aligning Large-Scale Parallel Corpora from the Web for South and East Asian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce neural methods and a toxicity filtering step to the hierarchical web mining approach of Paracrawl (Bañón et al., 2020), showing large improvements. We apply these methods to web-scale parallel corpus mining for 9 South and East Asian national languages, creating training resources for machine translation that yield better translation quality for most of these languages than existing publicly available datasets in OPUS. Our methods also generally lead to better results than the global mining approach of Schwenk et al. (2021).</abstract>
<identifier type="citekey">koehn-2024-neural</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.132</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1454</start>
<end>1466</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Neural Methods for Aligning Large-Scale Parallel Corpora from the Web for South and East Asian Languages
%A Koehn, Philipp
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F koehn-2024-neural
%X We introduce neural methods and a toxicity filtering step to the hierarchical web mining approach of Paracrawl (Bañón et al., 2020), showing large improvements. We apply these methods to web-scale parallel corpus mining for 9 South and East Asian national languages, creating training resources for machine translation that yield better translation quality for most of these languages than existing publicly available datasets in OPUS. Our methods also generally lead to better results than the global mining approach of Schwenk et al. (2021).
%U https://aclanthology.org/2024.wmt-1.132
%P 1454-1466
Markdown (Informal)
[Neural Methods for Aligning Large-Scale Parallel Corpora from the Web for South and East Asian Languages](https://aclanthology.org/2024.wmt-1.132) (Koehn, WMT 2024)
ACL