@inproceedings{fortuna-etal-2019-hierarchically,
title = "A Hierarchically-Labeled {P}ortuguese Hate Speech Dataset",
author = "Fortuna, Paula and
Rocha da Silva, Jo{\~a}o and
Soler-Company, Juan and
Wanner, Leo and
Nunes, S{\'e}rgio",
editor = "Roberts, Sarah T. and
Tetreault, Joel and
Prabhakaran, Vinodkumar and
Waseem, Zeerak",
booktitle = "Proceedings of the Third Workshop on Abusive Language Online",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-3510",
doi = "10.18653/v1/W19-3510",
pages = "94--104",
abstract = "Over the past years, the amount of online offensive speech has been growing steadily. To successfully cope with it, machine learning techniques are applied. However, ML-based techniques require sufficiently large annotated datasets. In the last years, different datasets were published, mainly for English. In this paper, we present a new dataset for Portuguese, which has not been in focus so far. The dataset is composed of 5,668 tweets. For its annotation, we defined two different schemes used by annotators with different levels of expertise. Firstly, non-experts annotated the tweets with binary labels ({`}hate{'} vs. {`}no-hate{'}). Secondly, expert annotators classified the tweets following a fine-grained hierarchical multiple label scheme with 81 hate speech categories in total. The inter-annotator agreement varied from category to category, which reflects the insight that some types of hate speech are more subtle than others and that their detection depends on personal perception. This hierarchical annotation scheme is the main contribution of the presented work, as it facilitates the identification of different types of hate speech and their intersections. To demonstrate the usefulness of our dataset, we carried out a baseline classification experiment with pre-trained word embeddings and LSTM on the binary classified data, with a state-of-the-art outcome.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fortuna-etal-2019-hierarchically">
<titleInfo>
<title>A Hierarchically-Labeled Portuguese Hate Speech Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paula</namePart>
<namePart type="family">Fortuna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Rocha da Silva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Soler-Company</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sérgio</namePart>
<namePart type="family">Nunes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Abusive Language Online</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="given">T</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vinodkumar</namePart>
<namePart type="family">Prabhakaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeerak</namePart>
<namePart type="family">Waseem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Over the past years, the amount of online offensive speech has been growing steadily. To successfully cope with it, machine learning techniques are applied. However, ML-based techniques require sufficiently large annotated datasets. In the last years, different datasets were published, mainly for English. In this paper, we present a new dataset for Portuguese, which has not been in focus so far. The dataset is composed of 5,668 tweets. For its annotation, we defined two different schemes used by annotators with different levels of expertise. Firstly, non-experts annotated the tweets with binary labels (‘hate’ vs. ‘no-hate’). Secondly, expert annotators classified the tweets following a fine-grained hierarchical multiple label scheme with 81 hate speech categories in total. The inter-annotator agreement varied from category to category, which reflects the insight that some types of hate speech are more subtle than others and that their detection depends on personal perception. This hierarchical annotation scheme is the main contribution of the presented work, as it facilitates the identification of different types of hate speech and their intersections. To demonstrate the usefulness of our dataset, we carried out a baseline classification experiment with pre-trained word embeddings and LSTM on the binary classified data, with a state-of-the-art outcome.</abstract>
<identifier type="citekey">fortuna-etal-2019-hierarchically</identifier>
<identifier type="doi">10.18653/v1/W19-3510</identifier>
<location>
<url>https://aclanthology.org/W19-3510</url>
</location>
<part>
<date>2019-08</date>
<extent unit="page">
<start>94</start>
<end>104</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Hierarchically-Labeled Portuguese Hate Speech Dataset
%A Fortuna, Paula
%A Rocha da Silva, João
%A Soler-Company, Juan
%A Wanner, Leo
%A Nunes, Sérgio
%Y Roberts, Sarah T.
%Y Tetreault, Joel
%Y Prabhakaran, Vinodkumar
%Y Waseem, Zeerak
%S Proceedings of the Third Workshop on Abusive Language Online
%D 2019
%8 August
%I Association for Computational Linguistics
%C Florence, Italy
%F fortuna-etal-2019-hierarchically
%X Over the past years, the amount of online offensive speech has been growing steadily. To successfully cope with it, machine learning techniques are applied. However, ML-based techniques require sufficiently large annotated datasets. In the last years, different datasets were published, mainly for English. In this paper, we present a new dataset for Portuguese, which has not been in focus so far. The dataset is composed of 5,668 tweets. For its annotation, we defined two different schemes used by annotators with different levels of expertise. Firstly, non-experts annotated the tweets with binary labels (‘hate’ vs. ‘no-hate’). Secondly, expert annotators classified the tweets following a fine-grained hierarchical multiple label scheme with 81 hate speech categories in total. The inter-annotator agreement varied from category to category, which reflects the insight that some types of hate speech are more subtle than others and that their detection depends on personal perception. This hierarchical annotation scheme is the main contribution of the presented work, as it facilitates the identification of different types of hate speech and their intersections. To demonstrate the usefulness of our dataset, we carried out a baseline classification experiment with pre-trained word embeddings and LSTM on the binary classified data, with a state-of-the-art outcome.
%R 10.18653/v1/W19-3510
%U https://aclanthology.org/W19-3510
%U https://doi.org/10.18653/v1/W19-3510
%P 94-104
Markdown (Informal)
[A Hierarchically-Labeled Portuguese Hate Speech Dataset](https://aclanthology.org/W19-3510) (Fortuna et al., ALW 2019)
ACL
- Paula Fortuna, João Rocha da Silva, Juan Soler-Company, Leo Wanner, and Sérgio Nunes. 2019. A Hierarchically-Labeled Portuguese Hate Speech Dataset. In Proceedings of the Third Workshop on Abusive Language Online, pages 94–104, Florence, Italy. Association for Computational Linguistics.