@inproceedings{song-etal-2021-large,
title = "A Large-scale Comprehensive Abusiveness Detection Dataset with Multifaceted Labels from {R}eddit",
author = "Song, Hoyun and
Ryu, Soo Hyun and
Lee, Huije and
Park, Jong",
editor = "Bisazza, Arianna and
Abend, Omri",
booktitle = "Proceedings of the 25th Conference on Computational Natural Language Learning",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.conll-1.43",
doi = "10.18653/v1/2021.conll-1.43",
pages = "552--561",
abstract = "As users in online communities suffer from severe side effects of abusive language, many researchers attempted to detect abusive texts from social media, presenting several datasets for such detection. However, none of them contain both comprehensive labels and contextual information, which are essential for thoroughly detecting all kinds of abusiveness from texts, since datasets with such fine-grained features demand a significant amount of annotations, leading to much increased complexity. In this paper, we propose a Comprehensive Abusiveness Detection Dataset (CADD), collected from the English Reddit posts, with multifaceted labels and contexts. Our dataset is annotated hierarchically for an efficient annotation through crowdsourcing on a large-scale. We also empirically explore the characteristics of our dataset and provide a detailed analysis for novel insights. The results of our experiments with strong pre-trained natural language understanding models on our dataset show that our dataset gives rise to meaningful performance, assuring its practicality for abusive language detection.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="song-etal-2021-large">
<titleInfo>
<title>A Large-scale Comprehensive Abusiveness Detection Dataset with Multifaceted Labels from Reddit</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hoyun</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soo</namePart>
<namePart type="given">Hyun</namePart>
<namePart type="family">Ryu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huije</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jong</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arianna</namePart>
<namePart type="family">Bisazza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omri</namePart>
<namePart type="family">Abend</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As users in online communities suffer from severe side effects of abusive language, many researchers attempted to detect abusive texts from social media, presenting several datasets for such detection. However, none of them contain both comprehensive labels and contextual information, which are essential for thoroughly detecting all kinds of abusiveness from texts, since datasets with such fine-grained features demand a significant amount of annotations, leading to much increased complexity. In this paper, we propose a Comprehensive Abusiveness Detection Dataset (CADD), collected from the English Reddit posts, with multifaceted labels and contexts. Our dataset is annotated hierarchically for an efficient annotation through crowdsourcing on a large-scale. We also empirically explore the characteristics of our dataset and provide a detailed analysis for novel insights. The results of our experiments with strong pre-trained natural language understanding models on our dataset show that our dataset gives rise to meaningful performance, assuring its practicality for abusive language detection.</abstract>
<identifier type="citekey">song-etal-2021-large</identifier>
<identifier type="doi">10.18653/v1/2021.conll-1.43</identifier>
<location>
<url>https://aclanthology.org/2021.conll-1.43</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>552</start>
<end>561</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Large-scale Comprehensive Abusiveness Detection Dataset with Multifaceted Labels from Reddit
%A Song, Hoyun
%A Ryu, Soo Hyun
%A Lee, Huije
%A Park, Jong
%Y Bisazza, Arianna
%Y Abend, Omri
%S Proceedings of the 25th Conference on Computational Natural Language Learning
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F song-etal-2021-large
%X As users in online communities suffer from severe side effects of abusive language, many researchers attempted to detect abusive texts from social media, presenting several datasets for such detection. However, none of them contain both comprehensive labels and contextual information, which are essential for thoroughly detecting all kinds of abusiveness from texts, since datasets with such fine-grained features demand a significant amount of annotations, leading to much increased complexity. In this paper, we propose a Comprehensive Abusiveness Detection Dataset (CADD), collected from the English Reddit posts, with multifaceted labels and contexts. Our dataset is annotated hierarchically for an efficient annotation through crowdsourcing on a large-scale. We also empirically explore the characteristics of our dataset and provide a detailed analysis for novel insights. The results of our experiments with strong pre-trained natural language understanding models on our dataset show that our dataset gives rise to meaningful performance, assuring its practicality for abusive language detection.
%R 10.18653/v1/2021.conll-1.43
%U https://aclanthology.org/2021.conll-1.43
%U https://doi.org/10.18653/v1/2021.conll-1.43
%P 552-561
Markdown (Informal)
[A Large-scale Comprehensive Abusiveness Detection Dataset with Multifaceted Labels from Reddit](https://aclanthology.org/2021.conll-1.43) (Song et al., CoNLL 2021)
ACL