BibTeX

@inproceedings{piot-parapar-2025-decoding,
title = "Decoding Hate: Exploring Language Models' Reactions to Hate Speech",
author = "Piot, Paloma and
Parapar, Javier",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.45/",
doi = "10.18653/v1/2025.naacl-long.45",
pages = "973--990",
ISBN = "979-8-89176-189-6",
abstract = "Hate speech is a harmful form of online expression, often manifesting as derogatory posts. It is a significant risk in digital environments. With the rise of Large Language Models (LLMs), there is concern about their potential to replicate hate speech patterns, given their training on vast amounts of unmoderated internet data. Understanding how LLMs respond to hate speech is crucial for their responsible deployment. However, the behaviour of LLMs towards hate speech has been limited compared. This paper investigates the reactions of seven state-of-the-art LLMs (LLaMA 2, Vicuna, LLaMA 3, Mistral, GPT-3.5, GPT-4, and Gemini Pro) to hate speech. Through qualitative analysis, we aim to reveal the spectrum of responses these models produce, highlighting their capacity to handle hate speech inputs. We also discuss strategies to mitigate hate speech generation by LLMs, particularly through fine-tuning and guideline guardrailing. Finally, we explore the models' responses to hate speech framed in politically correct language."
}

MODS XML

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="piot-parapar-2025-decoding">
    <titleInfo>
      <title>Decoding Hate: Exploring Language Models’ Reactions to Hate Speech</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Paloma</namePart>
      <namePart type="family">Piot</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Javier</namePart>
      <namePart type="family">Parapar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-189-6</identifier>
    </relatedItem>
    <abstract>Hate speech is a harmful form of online expression, often manifesting as derogatory posts. It is a significant risk in digital environments. With the rise of Large Language Models (LLMs), there is concern about their potential to replicate hate speech patterns, given their training on vast amounts of unmoderated internet data. Understanding how LLMs respond to hate speech is crucial for their responsible deployment. However, the behaviour of LLMs towards hate speech has received comparatively little study. This paper investigates the reactions of seven state-of-the-art LLMs (LLaMA 2, Vicuna, LLaMA 3, Mistral, GPT-3.5, GPT-4, and Gemini Pro) to hate speech. Through qualitative analysis, we aim to reveal the spectrum of responses these models produce, highlighting their capacity to handle hate speech inputs. We also discuss strategies to mitigate hate speech generation by LLMs, particularly through fine-tuning and guideline guardrailing. Finally, we explore the models’ responses to hate speech framed in politically correct language.</abstract>
<identifier type="citekey">piot-parapar-2025-decoding</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.45</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.45/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>973</start>
<end>990</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote

%0 Conference Proceedings
%T Decoding Hate: Exploring Language Models’ Reactions to Hate Speech
%A Piot, Paloma
%A Parapar, Javier
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F piot-parapar-2025-decoding
%X Hate speech is a harmful form of online expression, often manifesting as derogatory posts. It is a significant risk in digital environments. With the rise of Large Language Models (LLMs), there is concern about their potential to replicate hate speech patterns, given their training on vast amounts of unmoderated internet data. Understanding how LLMs respond to hate speech is crucial for their responsible deployment. However, the behaviour of LLMs towards hate speech has received comparatively little study. This paper investigates the reactions of seven state-of-the-art LLMs (LLaMA 2, Vicuna, LLaMA 3, Mistral, GPT-3.5, GPT-4, and Gemini Pro) to hate speech. Through qualitative analysis, we aim to reveal the spectrum of responses these models produce, highlighting their capacity to handle hate speech inputs. We also discuss strategies to mitigate hate speech generation by LLMs, particularly through fine-tuning and guideline guardrailing. Finally, we explore the models’ responses to hate speech framed in politically correct language.
%R 10.18653/v1/2025.naacl-long.45
%U https://aclanthology.org/2025.naacl-long.45/
%U https://doi.org/10.18653/v1/2025.naacl-long.45
%P 973-990

Markdown (Informal)

[Decoding Hate: Exploring Language Models’ Reactions to Hate Speech](https://aclanthology.org/2025.naacl-long.45/) (Piot & Parapar, NAACL 2025)

ACL

Paloma Piot and Javier Parapar. 2025. Decoding Hate: Exploring Language Models’ Reactions to Hate Speech. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 973–990, Albuquerque, New Mexico. Association for Computational Linguistics.