@inproceedings{ocampo-etal-2023-playing,
title = "Playing the Part of the Sharp Bully: Generating Adversarial Examples for Implicit Hate Speech Detection",
author = "Ocampo, Nicol{\'a}s Benjam{\'\i}n and
Cabrio, Elena and
Villata, Serena",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.173",
doi = "10.18653/v1/2023.findings-acl.173",
pages = "2758--2772",
abstract = "Research on abusive content detection on social media has primarily focused on explicit forms of hate speech (HS), that are often identifiable by recognizing hateful words and expressions. Messages containing linguistically subtle and implicit forms of hate speech still constitute an open challenge for automatic hate speech detection. In this paper, we propose a new framework for generating adversarial implicit HS short-text messages using Auto-regressive Language Models. Moreover, we propose a strategy to group the generated implicit messages in complexity levels (EASY, MEDIUM, and HARD categories) characterizing how challenging these messages are for supervised classifiers. Finally, relying on (Dinan et al., 2019; Vidgen et al., 2021), we propose a {``}build it, break it, fix it{''}, training scheme using HARD messages showing how iteratively retraining on HARD messages substantially leverages SOTA models{'} performances on implicit HS benchmarks.",
}
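The entry above is plain key/value BibTeX. As a minimal sketch (Python, standard library only), the fields of this particular entry can be pulled out with a regex. This is not a general BibTeX parser (it ignores unquoted values such as month = jul, {...}-braced values, and @string macros); the abbreviated BIBTEX string is a hypothetical stand-in for the full entry:

import re

# Abbreviated copy of the entry above, for illustration only.
BIBTEX = r'''
@inproceedings{ocampo-etal-2023-playing,
    title = "Playing the Part of the Sharp Bully: ...",
    author = "Ocampo, Nicol{\'a}s Benjam{\'\i}n and Cabrio, Elena and Villata, Serena",
    year = "2023",
    pages = "2758--2772",
}
'''

# Match key = "value" pairs; a value may span lines and contain \-escapes.
FIELD = re.compile(r'(\w+)\s*=\s*"((?:[^"\\]|\\.)*)"', re.S)

fields = {k.lower(): " ".join(v.split()) for k, v in FIELD.findall(BIBTEX)}
print(fields["title"])
print(fields["author"].split(" and "))  # BibTeX joins author names with " and "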
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ocampo-etal-2023-playing">
<titleInfo>
<title>Playing the Part of the Sharp Bully: Generating Adversarial Examples for Implicit Hate Speech Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicolás</namePart>
<namePart type="given">Benjamín</namePart>
<namePart type="family">Ocampo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Cabrio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Serena</namePart>
<namePart type="family">Villata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Research on abusive content detection on social media has primarily focused on explicit forms of hate speech (HS), that are often identifiable by recognizing hateful words and expressions. Messages containing linguistically subtle and implicit forms of hate speech still constitute an open challenge for automatic hate speech detection. In this paper, we propose a new framework for generating adversarial implicit HS short-text messages using Auto-regressive Language Models. Moreover, we propose a strategy to group the generated implicit messages in complexity levels (EASY, MEDIUM, and HARD categories) characterizing how challenging these messages are for supervised classifiers. Finally, relying on (Dinan et al., 2019; Vidgen et al., 2021), we propose a “build it, break it, fix it”, training scheme using HARD messages showing how iteratively retraining on HARD messages substantially leverages SOTA models’ performances on implicit HS benchmarks.</abstract>
<identifier type="citekey">ocampo-etal-2023-playing</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.173</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.173</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>2758</start>
<end>2772</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Playing the Part of the Sharp Bully: Generating Adversarial Examples for Implicit Hate Speech Detection
%A Ocampo, Nicolás Benjamín
%A Cabrio, Elena
%A Villata, Serena
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F ocampo-etal-2023-playing
%X Research on abusive content detection on social media has primarily focused on explicit forms of hate speech (HS), which are often identifiable by recognizing hateful words and expressions. Messages containing linguistically subtle and implicit forms of hate speech still constitute an open challenge for automatic hate speech detection. In this paper, we propose a new framework for generating adversarial implicit HS short-text messages using auto-regressive language models. Moreover, we propose a strategy to group the generated implicit messages into complexity levels (EASY, MEDIUM, and HARD categories) characterizing how challenging these messages are for supervised classifiers. Finally, building on (Dinan et al., 2019; Vidgen et al., 2021), we propose a “build it, break it, fix it” training scheme using HARD messages, showing how iteratively retraining on HARD messages substantially improves SOTA models’ performance on implicit HS benchmarks.
%R 10.18653/v1/2023.findings-acl.173
%U https://aclanthology.org/2023.findings-acl.173
%U https://doi.org/10.18653/v1/2023.findings-acl.173
%P 2758-2772
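The %-tagged block above is the Endnote/refer export: one "%<tag> <value>" pair per line, where some tags repeat (%A authors, %Y editors, %U URLs). A minimal line-based reader in Python; RECORD is an abbreviated copy of the record above, for illustration only:

RECORD = """\
%0 Conference Proceedings
%T Playing the Part of the Sharp Bully: ...
%A Ocampo, Nicolás Benjamín
%A Cabrio, Elena
%A Villata, Serena
%D 2023
%P 2758-2772
"""

REPEATABLE = {"A", "Y", "U"}  # tags that may occur more than once
entry = {}
for line in RECORD.splitlines():
    tag, _, value = line[1:].partition(" ")  # drop the leading "%"
    if tag in REPEATABLE:
        entry.setdefault(tag, []).append(value)
    else:
        entry[tag] = value

print(entry["T"])
print(entry["A"])  # ['Ocampo, Nicolás Benjamín', 'Cabrio, Elena', 'Villata, Serena']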
Markdown (Informal)
[Playing the Part of the Sharp Bully: Generating Adversarial Examples for Implicit Hate Speech Detection](https://aclanthology.org/2023.findings-acl.173) (Ocampo et al., Findings 2023)
ACL
Nicolás Benjamín Ocampo, Elena Cabrio, and Serena Villata. 2023. [Playing the Part of the Sharp Bully: Generating Adversarial Examples for Implicit Hate Speech Detection](https://aclanthology.org/2023.findings-acl.173). In *Findings of the Association for Computational Linguistics: ACL 2023*, pages 2758–2772, Toronto, Canada. Association for Computational Linguistics.