@inproceedings{van-noord-etal-2025-quality,
title = "Quality Beyond A Glance: Revealing Large Quality Differences Between Web-Crawled Parallel Corpora",
author = "van Noord, Rik and
Espl{\`a}-Gomis, Miquel and
Chichirau, Malina and
Ram{\'i}rez-S{\'a}nchez, Gema and
Toral, Antonio",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.124/",
pages = "1824--1838",
abstract = "Parallel corpora play a vital role in advanced multilingual natural language processing tasks, notably in machine translation (MT). The recent emergence of numerous large parallel corpora, often extracted from multilingual documents on the Internet, has expanded the available resources. Nevertheless, the quality of these corpora remains largely unexplored, while there are large differences in how the corpora are constructed. Moreover, how the potential differences affect the performance of neural MT (NMT) systems has also received limited attention. This study addresses this gap by manually and automatically evaluating four well-known publicly available parallel corpora across eleven language pairs. Our findings are quite concerning: all corpora contain a substantial amount of noisy sentence pairs, with CCMatrix and CCAligned having well below of 50{\%} reasonably clean pairs. MaCoCu and ParaCrawl generally have higher quality texts, though around a third of the texts still have clear issues. While corpus size impacts NMT models' performance, our study highlights the critical role of quality: higher-quality corpora consistently yield better-performing NMT models when controlling for size."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="van-noord-etal-2025-quality">
<titleInfo>
<title>Quality Beyond A Glance: Revealing Large Quality Differences Between Web-Crawled Parallel Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rik</namePart>
<namePart type="family">van Noord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malina</namePart>
<namePart type="family">Chichirau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gema</namePart>
<namePart type="family">Ramírez-Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Toral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Parallel corpora play a vital role in advanced multilingual natural language processing tasks, notably in machine translation (MT). The recent emergence of numerous large parallel corpora, often extracted from multilingual documents on the Internet, has expanded the available resources. Nevertheless, the quality of these corpora remains largely unexplored, while there are large differences in how the corpora are constructed. Moreover, how the potential differences affect the performance of neural MT (NMT) systems has also received limited attention. This study addresses this gap by manually and automatically evaluating four well-known publicly available parallel corpora across eleven language pairs. Our findings are quite concerning: all corpora contain a substantial amount of noisy sentence pairs, with CCMatrix and CCAligned having well below of 50% reasonably clean pairs. MaCoCu and ParaCrawl generally have higher quality texts, though around a third of the texts still have clear issues. While corpus size impacts NMT models’ performance, our study highlights the critical role of quality: higher-quality corpora consistently yield better-performing NMT models when controlling for size.</abstract>
<identifier type="citekey">van-noord-etal-2025-quality</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.124/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>1824</start>
<end>1838</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Quality Beyond A Glance: Revealing Large Quality Differences Between Web-Crawled Parallel Corpora
%A van Noord, Rik
%A Esplà-Gomis, Miquel
%A Chichirau, Malina
%A Ramírez-Sánchez, Gema
%A Toral, Antonio
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F van-noord-etal-2025-quality
%X Parallel corpora play a vital role in advanced multilingual natural language processing tasks, notably in machine translation (MT). The recent emergence of numerous large parallel corpora, often extracted from multilingual documents on the Internet, has expanded the available resources. Nevertheless, the quality of these corpora remains largely unexplored, while there are large differences in how the corpora are constructed. Moreover, how the potential differences affect the performance of neural MT (NMT) systems has also received limited attention. This study addresses this gap by manually and automatically evaluating four well-known publicly available parallel corpora across eleven language pairs. Our findings are quite concerning: all corpora contain a substantial amount of noisy sentence pairs, with CCMatrix and CCAligned having well below of 50% reasonably clean pairs. MaCoCu and ParaCrawl generally have higher quality texts, though around a third of the texts still have clear issues. While corpus size impacts NMT models’ performance, our study highlights the critical role of quality: higher-quality corpora consistently yield better-performing NMT models when controlling for size.
%U https://aclanthology.org/2025.coling-main.124/
%P 1824-1838
Markdown (Informal)
[Quality Beyond A Glance: Revealing Large Quality Differences Between Web-Crawled Parallel Corpora](https://aclanthology.org/2025.coling-main.124/) (van Noord et al., COLING 2025)
ACL