@inproceedings{luccioni-viviano-2021-whats,
title = "What{'}s in the Box? An Analysis of Undesirable Content in the {C}ommon {C}rawl Corpus",
author = "Luccioni, Alexandra and
Viviano, Joseph",
editor = "Zong, Chengqing and
Xia, Fei and
Li, Wenjie and
Navigli, Roberto",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-short.24",
doi = "10.18653/v1/2021.acl-short.24",
pages = "182--189",
abstract = "Whereas much of the success of the current generation of neural language models has been driven by increasingly large training corpora, relatively little research has been dedicated to analyzing these massive sources of textual data. In this exploratory analysis, we delve deeper into the Common Crawl, a colossal web corpus that is extensively used for training language models. We find that it contains a significant amount of undesirable content, including hate speech and sexually explicit content, even after filtering procedures. We discuss the potential impacts of this content on language models and conclude with future research directions and a more mindful approach to corpus collection and analysis.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="luccioni-viviano-2021-whats">
<titleInfo>
<title>What’s in the Box? An Analysis of Undesirable Content in the Common Crawl Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Luccioni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Viviano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenjie</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Navigli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Whereas much of the success of the current generation of neural language models has been driven by increasingly large training corpora, relatively little research has been dedicated to analyzing these massive sources of textual data. In this exploratory analysis, we delve deeper into the Common Crawl, a colossal web corpus that is extensively used for training language models. We find that it contains a significant amount of undesirable content, including hate speech and sexually explicit content, even after filtering procedures. We discuss the potential impacts of this content on language models and conclude with future research directions and a more mindful approach to corpus collection and analysis.</abstract>
<identifier type="citekey">luccioni-viviano-2021-whats</identifier>
<identifier type="doi">10.18653/v1/2021.acl-short.24</identifier>
<location>
<url>https://aclanthology.org/2021.acl-short.24</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>182</start>
<end>189</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T What’s in the Box? An Analysis of Undesirable Content in the Common Crawl Corpus
%A Luccioni, Alexandra
%A Viviano, Joseph
%Y Zong, Chengqing
%Y Xia, Fei
%Y Li, Wenjie
%Y Navigli, Roberto
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F luccioni-viviano-2021-whats
%X Whereas much of the success of the current generation of neural language models has been driven by increasingly large training corpora, relatively little research has been dedicated to analyzing these massive sources of textual data. In this exploratory analysis, we delve deeper into the Common Crawl, a colossal web corpus that is extensively used for training language models. We find that it contains a significant amount of undesirable content, including hate speech and sexually explicit content, even after filtering procedures. We discuss the potential impacts of this content on language models and conclude with future research directions and a more mindful approach to corpus collection and analysis.
%R 10.18653/v1/2021.acl-short.24
%U https://aclanthology.org/2021.acl-short.24
%U https://doi.org/10.18653/v1/2021.acl-short.24
%P 182-189
Markdown (Informal)
[What’s in the Box? An Analysis of Undesirable Content in the Common Crawl Corpus](https://aclanthology.org/2021.acl-short.24) (Luccioni & Viviano, ACL-IJCNLP 2021)
ACL