@inproceedings{ilan-vilenchik-2022-harald,
  title     = {{HARALD}: Augmenting Hate Speech Data Sets with Real Data},
  author    = {Ilan, Tal and Vilenchik, Dan},
  editor    = {Goldberg, Yoav and Kozareva, Zornitsa and Zhang, Yue},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2022},
  month     = dec,
  year      = {2022},
  address   = {Abu Dhabi, United Arab Emirates},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.findings-emnlp.165},
  doi       = {10.18653/v1/2022.findings-emnlp.165},
  pages     = {2241--2248},
  abstract  = {The successful completion of the hate speech detection task hinges upon the availability of rich and variable labeled data, which is hard to obtain. In this work, we present a new approach for data augmentation that uses as input real unlabelled data, which is carefully selected from online platforms where invited hate speech is abundant. We show that by harvesting and processing this data (in an automatic manner), one can augment existing manually-labeled datasets to improve the classification performance of hate speech classification models. We observed an improvement in F1-score ranging from 2.7{\%} and up to 9.5{\%}, depending on the task (in- or cross-domain) and the model used.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ilan-vilenchik-2022-harald">
    <!-- The paper itself: title, authors, issue date -->
    <titleInfo>
      <title>HARALD: Augmenting Hate Speech Data Sets with Real Data</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Tal</namePart>
      <namePart type="family">Ilan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dan</namePart>
      <namePart type="family">Vilenchik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume, its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yoav</namePart>
        <namePart type="family">Goldberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zornitsa</namePart>
        <namePart type="family">Kozareva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The successful completion of the hate speech detection task hinges upon the availability of rich and variable labeled data, which is hard to obtain. In this work, we present a new approach for data augmentation that uses as input real unlabelled data, which is carefully selected from online platforms where invited hate speech is abundant. We show that by harvesting and processing this data (in an automatic manner), one can augment existing manually-labeled datasets to improve the classification performance of hate speech classification models. We observed an improvement in F1-score ranging from 2.7% and up to 9.5%, depending on the task (in- or cross-domain) and the model used.</abstract>
    <!-- Identifiers and location of the paper -->
    <identifier type="citekey">ilan-vilenchik-2022-harald</identifier>
    <identifier type="doi">10.18653/v1/2022.findings-emnlp.165</identifier>
    <location>
      <url>https://aclanthology.org/2022.findings-emnlp.165</url>
    </location>
    <!-- Page range within the host volume -->
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>2241</start>
        <end>2248</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T HARALD: Augmenting Hate Speech Data Sets with Real Data
%A Ilan, Tal
%A Vilenchik, Dan
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F ilan-vilenchik-2022-harald
%X The successful completion of the hate speech detection task hinges upon the availability of rich and variable labeled data, which is hard to obtain. In this work, we present a new approach for data augmentation that uses as input real unlabelled data, which is carefully selected from online platforms where invited hate speech is abundant. We show that by harvesting and processing this data (in an automatic manner), one can augment existing manually-labeled datasets to improve the classification performance of hate speech classification models. We observed an improvement in F1-score ranging from 2.7% and up to 9.5%, depending on the task (in- or cross-domain) and the model used.
%R 10.18653/v1/2022.findings-emnlp.165
%U https://aclanthology.org/2022.findings-emnlp.165
%U https://doi.org/10.18653/v1/2022.findings-emnlp.165
%P 2241-2248
Markdown (Informal)
[HARALD: Augmenting Hate Speech Data Sets with Real Data](https://aclanthology.org/2022.findings-emnlp.165) (Ilan & Vilenchik, Findings 2022)
ACL