@inproceedings{nikolova-stoupak-etal-2022-filtering,
title = "Filtering of Noisy Web-Crawled Parallel Corpus: the {J}apanese-{B}ulgarian Language Pair",
author = "Nikolova-Stoupak, Iglika and
Shimizu, Shuichiro and
Chu, Chenhui and
Kurohashi, Sadao",
booktitle = "Proceedings of the Fifth International Conference on Computational Linguistics in Bulgaria (CLIB 2022)",
month = sep,
year = "2022",
address = "Sofia, Bulgaria",
publisher = "Department of Computational Linguistics, IBL -- BAS",
url = "https://aclanthology.org/2022.clib-1.4/",
pages = "39--48",
abstract = "One of the main challenges within the rapidly developing field of neural machine translation is its application to low-resource languages. Recent attempts to provide large parallel corpora in rare language pairs include the generation of web-crawled corpora, which may be vast but are, unfortunately, excessively noisy. The corpus utilised to train machine translation models in the study is CCMatrix, provided by OPUS. Firstly, the corpus is cleaned based on a number of heuristic rules. Then, parts of it are selected in three discrete ways: at random, based on the {\textquotedblleft}margin distance{\textquotedblright} metric that is native to the CCMatrix dataset, and based on scores derived through the application of a state-of-the-art classifier model (Acarcicek et al., 2020) utilised in a thematic WMT shared task. The performance of the issuing models is evaluated and compared. The classifier-based model does not reach high performance as compared with its margin-based counterpart, opening a discussion of ways for further improvement. Still, BLEU scores surpass those of Acarcicek et al.'s (2020) paper by over 15 points."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nikolova-stoupak-etal-2022-filtering">
<titleInfo>
<title>Filtering of Noisy Web-Crawled Parallel Corpus: the Japanese-Bulgarian Language Pair</title>
</titleInfo>
<name type="personal">
<namePart type="given">Iglika</namePart>
<namePart type="family">Nikolova-Stoupak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuichiro</namePart>
<namePart type="family">Shimizu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenhui</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth International Conference on Computational Linguistics in Bulgaria (CLIB 2022)</title>
</titleInfo>
<originInfo>
<publisher>Department of Computational Linguistics, IBL – BAS</publisher>
<place>
<placeTerm type="text">Sofia, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>One of the main challenges within the rapidly developing field of neural machine translation is its application to low-resource languages. Recent attempts to provide large parallel corpora in rare language pairs include the generation of web-crawled corpora, which may be vast but are, unfortunately, excessively noisy. The corpus utilised to train machine translation models in the study is CCMatrix, provided by OPUS. Firstly, the corpus is cleaned based on a number of heuristic rules. Then, parts of it are selected in three discrete ways: at random, based on the “margin distance” metric that is native to the CCMatrix dataset, and based on scores derived through the application of a state-of-the-art classifier model (Acarcicek et al., 2020) utilised in a thematic WMT shared task. The performance of the issuing models is evaluated and compared. The classifier-based model does not reach high performance as compared with its margin-based counterpart, opening a discussion of ways for further improvement. Still, BLEU scores surpass those of Acarcicek et al.’s (2020) paper by over 15 points.</abstract>
<identifier type="citekey">nikolova-stoupak-etal-2022-filtering</identifier>
<location>
<url>https://aclanthology.org/2022.clib-1.4/</url>
</location>
<part>
<date>2022-09</date>
<extent unit="page">
<start>39</start>
<end>48</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Filtering of Noisy Web-Crawled Parallel Corpus: the Japanese-Bulgarian Language Pair
%A Nikolova-Stoupak, Iglika
%A Shimizu, Shuichiro
%A Chu, Chenhui
%A Kurohashi, Sadao
%S Proceedings of the Fifth International Conference on Computational Linguistics in Bulgaria (CLIB 2022)
%D 2022
%8 September
%I Department of Computational Linguistics, IBL – BAS
%C Sofia, Bulgaria
%F nikolova-stoupak-etal-2022-filtering
%X One of the main challenges within the rapidly developing field of neural machine translation is its application to low-resource languages. Recent attempts to provide large parallel corpora in rare language pairs include the generation of web-crawled corpora, which may be vast but are, unfortunately, excessively noisy. The corpus utilised to train machine translation models in the study is CCMatrix, provided by OPUS. Firstly, the corpus is cleaned based on a number of heuristic rules. Then, parts of it are selected in three discrete ways: at random, based on the “margin distance” metric that is native to the CCMatrix dataset, and based on scores derived through the application of a state-of-the-art classifier model (Acarcicek et al., 2020) utilised in a thematic WMT shared task. The performance of the issuing models is evaluated and compared. The classifier-based model does not reach high performance as compared with its margin-based counterpart, opening a discussion of ways for further improvement. Still, BLEU scores surpass those of Acarcicek et al.’s (2020) paper by over 15 points.
%U https://aclanthology.org/2022.clib-1.4/
%P 39-48
Markdown (Informal)
[Filtering of Noisy Web-Crawled Parallel Corpus: the Japanese-Bulgarian Language Pair](https://aclanthology.org/2022.clib-1.4/) (Nikolova-Stoupak et al., CLIB 2022)
ACL