@inproceedings{van-rosendaal-etal-2020-lower,
title = "Lower Bias, Higher Density Abusive Language Datasets: A Recipe",
author = "van Rosendaal, Juliet and
Caselli, Tommaso and
Nissim, Malvina",
editor = "Monti, Johanna and
Basile, Valerio and
Di Buono, Maria Pia and
Manna, Raffaele and
Pascucci, Antonio and
Tonelli, Sara",
booktitle = "Proceedings of the Workshop on Resources and Techniques for User and Author Profiling in Abusive Language",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/2020.restup-1.4",
pages = "14--19",
abstract = "Datasets to train models for abusive language detection are at the same time necessary and still scarce. One the reasons for their limited availability is the cost of their creation. It is not only that manual annotation is expensive, it is also the case that the phenomenon is sparse, causing human annotators having to go through a large number of irrelevant examples in order to obtain some significant data. Strategies used until now to increase density of abusive language and obtain more meaningful data overall, include data filtering on the basis of pre-selected keywords and hate-rich sources of data. We suggest a recipe that at the same time can provide meaningful data with possibly higher density of abusive language and also reduce top-down biases imposed by corpus creators in the selection of the data to annotate. More specifically, we exploit the controversy channel on Reddit to obtain keywords that are used to filter a Twitter dataset. While the method needs further validation and refinement, our preliminary experiments show a higher density of abusive tweets in the filtered vs unfiltered dataset, and a more meaningful topic distribution after filtering.",
language = "English",
ISBN = "979-10-95546-49-8",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="van-rosendaal-etal-2020-lower">
<titleInfo>
<title>Lower Bias, Higher Density Abusive Language Datasets: A Recipe</title>
</titleInfo>
<name type="personal">
<namePart type="given">Juliet</namePart>
<namePart type="family">van Rosendaal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tommaso</namePart>
<namePart type="family">Caselli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malvina</namePart>
<namePart type="family">Nissim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Resources and Techniques for User and Author Profiling in Abusive Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Johanna</namePart>
<namePart type="family">Monti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valerio</namePart>
<namePart type="family">Basile</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Pia</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Buono</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raffaele</namePart>
<namePart type="family">Manna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Pascucci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Tonelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-49-8</identifier>
</relatedItem>
<abstract>Datasets to train models for abusive language detection are at the same time necessary and still scarce. One the reasons for their limited availability is the cost of their creation. It is not only that manual annotation is expensive, it is also the case that the phenomenon is sparse, causing human annotators having to go through a large number of irrelevant examples in order to obtain some significant data. Strategies used until now to increase density of abusive language and obtain more meaningful data overall, include data filtering on the basis of pre-selected keywords and hate-rich sources of data. We suggest a recipe that at the same time can provide meaningful data with possibly higher density of abusive language and also reduce top-down biases imposed by corpus creators in the selection of the data to annotate. More specifically, we exploit the controversy channel on Reddit to obtain keywords that are used to filter a Twitter dataset. While the method needs further validation and refinement, our preliminary experiments show a higher density of abusive tweets in the filtered vs unfiltered dataset, and a more meaningful topic distribution after filtering.</abstract>
<identifier type="citekey">van-rosendaal-etal-2020-lower</identifier>
<location>
<url>https://aclanthology.org/2020.restup-1.4</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>14</start>
<end>19</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Lower Bias, Higher Density Abusive Language Datasets: A Recipe
%A van Rosendaal, Juliet
%A Caselli, Tommaso
%A Nissim, Malvina
%Y Monti, Johanna
%Y Basile, Valerio
%Y Di Buono, Maria Pia
%Y Manna, Raffaele
%Y Pascucci, Antonio
%Y Tonelli, Sara
%S Proceedings of the Workshop on Resources and Techniques for User and Author Profiling in Abusive Language
%D 2020
%8 May
%I European Language Resources Association (ELRA)
%C Marseille, France
%@ 979-10-95546-49-8
%G English
%F van-rosendaal-etal-2020-lower
%X Datasets to train models for abusive language detection are both necessary and still scarce. One of the reasons for their limited availability is the cost of their creation. Not only is manual annotation expensive; the phenomenon is also sparse, forcing human annotators to go through a large number of irrelevant examples in order to obtain some significant data. Strategies used until now to increase the density of abusive language and obtain more meaningful data overall include filtering data on the basis of pre-selected keywords and drawing on hate-rich sources. We suggest a recipe that can at the same time provide meaningful data with a possibly higher density of abusive language and reduce the top-down biases imposed by corpus creators in the selection of the data to annotate. More specifically, we exploit the controversy channel on Reddit to obtain keywords that are then used to filter a Twitter dataset. While the method needs further validation and refinement, our preliminary experiments show a higher density of abusive tweets in the filtered vs. unfiltered dataset, and a more meaningful topic distribution after filtering.
%U https://aclanthology.org/2020.restup-1.4
%P 14-19
Markdown (Informal)
[Lower Bias, Higher Density Abusive Language Datasets: A Recipe](https://aclanthology.org/2020.restup-1.4) (van Rosendaal et al., ResTUP 2020)
ACL
Juliet van Rosendaal, Tommaso Caselli, and Malvina Nissim. 2020. Lower Bias, Higher Density Abusive Language Datasets: A Recipe. In Proceedings of the Workshop on Resources and Techniques for User and Author Profiling in Abusive Language, pages 14–19, Marseille, France. European Language Resources Association (ELRA).
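The recipe described in the abstract, harvesting keywords from Reddit's controversy channel and using them to filter a Twitter collection, can be sketched in a few lines. The snippet below is a minimal, hypothetical illustration under assumed choices (Reddit's public JSON listing, a plain frequency ranking of content words, and keyword matching on tweet text); it is not the authors' actual pipeline, whose details are in the paper itself.

```python
# Hypothetical sketch of the keyword-harvesting recipe: the endpoint,
# tokenizer, stopword list, and top-N cutoff are illustrative assumptions,
# not the authors' exact choices.
import re
from collections import Counter

import requests

STOPWORDS = {"the", "a", "an", "and", "or", "of", "to", "in", "is", "it",
             "for", "on", "with", "that", "this", "are", "was", "be", "as"}


def controversial_titles(subreddit: str = "all", limit: int = 100) -> list[str]:
    """Fetch post titles from Reddit's public 'controversial' listing."""
    url = f"https://www.reddit.com/r/{subreddit}/controversial.json"
    resp = requests.get(
        url,
        params={"limit": limit, "t": "week"},
        headers={"User-Agent": "keyword-recipe-sketch/0.1"},
        timeout=10,
    )
    resp.raise_for_status()
    return [child["data"]["title"] for child in resp.json()["data"]["children"]]


def extract_keywords(titles: list[str], top_n: int = 25) -> list[str]:
    """Rank content words by raw frequency across the harvested titles."""
    tokens = (tok for title in titles
              for tok in re.findall(r"[a-z']+", title.lower()))
    counts = Counter(tok for tok in tokens
                     if tok not in STOPWORDS and len(tok) > 2)
    return [word for word, _ in counts.most_common(top_n)]


def filter_tweets(tweets: list[str], keywords: list[str]) -> list[str]:
    """Keep only tweets that mention at least one harvested keyword."""
    kw = set(keywords)
    return [t for t in tweets
            if kw & set(re.findall(r"[a-z']+", t.lower()))]


if __name__ == "__main__":
    keywords = extract_keywords(controversial_titles())
    sample = ["a tweet about some controversial topic", "an unrelated tweet"]
    print(filter_tweets(sample, keywords))
```

In the paper's setting, the input tweets would come from an unfiltered Twitter sample, and the point of the recipe is that the keyword list is induced bottom-up from controversy signals rather than pre-selected by the corpus creators; the time window, listing size, and keyword cutoff above are all tunable knobs.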