@inproceedings{L16-1286,
 abstract = {We introduce a framework for quality assurance of corpora, and apply it to the Reuters Multilingual Corpus (RCV2). The results of this quality assessment of this standard newsprint corpus reveal a significant duplication problem and, to a lesser extent, a problem with corrupted articles. From the raw collection of some 487,000 articles, almost one tenth are trivial duplicates. A smaller fraction of articles appear to be corrupted and should be excluded for that reason. The detailed results are being made available as on-line appendices to this article. This effort also demonstrates the beginnings of a constraint-based methodological framework for quality assessment and quality assurance for corpora. As a first implementation of this framework, we have investigated constraints to verify sample integrity, and to diagnose sample duplication, entropy aberrations, and tagging inconsistencies. To help identify near-duplicates in the corpus, we have employed both entropy measurements and a simple byte bigram incidence digest.
},
 address = {Portorož, Slovenia},
 author = {Robin Eriksson},
 booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
 month = {May},
 pages = {1813--1819},
 publisher = {European Language Resources Association (ELRA)},
 title = {Quality Assessment of the Reuters Vol. 2 Multilingual Corpus},
 url = {https://www.aclweb.org/anthology/L16-1286},
 year = {2016}
}

