@inproceedings{vecchi-etal-2026-harm,
title = "{HARM}: Learning Hate-Aware Reward Model for Evaluating Natural Language Explanations of Offensive Content",
author = "Vecchi, Lorenzo Puppi and
Jr., Alceu De Souza Britto and
Paraiso, Emerson Cabrera and
M. O. Cruz, Rafael",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.230/",
pages = "4393--4431",
ISBN = "979-8-89176-386-9",
abstract = "Explaining why content is hateful using natural language is crucial for fostering transparency in automated content moderation systems. However, evaluating the quality of such explanations remains an open challenge. General-purpose reward models (RMs), commonly used for scoring natural language outputs, are typically optimized for broad notions of safety. We argue that this optimization penalizes situations where references to stereotypes or offensive content are essential for explanations with higher explanatory fidelity. To address this gap, we introduce SBIC-Explain, a human-validated dataset of 370,788 LLM generated NLEs for offensive content, spanning three levels of human-annotated contextual richness: Tier 1: text-only, Tier 2: + classification-aware, and Tier 3: + semantics-informed. We hypothesize that as human-annotated context increases, explanations should lead to higher perceived explanations with higher explanatory fidelity. Yet, we find that existing RMs systematically assign lower scores to more contextually rich (and often more offensive) explanations, revealing a misalignment between model preferences and explanatory fidelity for this context. We propose HARM (Hate-Aware Reward Model), a RM that integrates interpretable signals to better align reward scores with the needs of hate speech explanation. HARM outperforms general-purpose baselines, improving NLE pair-wise preference. Available at: https://github.com/Lorenzo815/HARM."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vecchi-etal-2026-harm">
<titleInfo>
<title>HARM: Learning Hate-Aware Reward Model for Evaluating Natural Language Explanations of Offensive Content</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lorenzo</namePart>
<namePart type="given">Puppi</namePart>
<namePart type="family">Vecchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alceu</namePart>
<namePart type="given">De</namePart>
<namePart type="given">Souza</namePart>
<namePart type="given">Britto</namePart>
<namePart type="family">Jr.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emerson</namePart>
<namePart type="given">Cabrera</namePart>
<namePart type="family">Paraiso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rafael</namePart>
<namePart type="family">M. O. Cruz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Explaining why content is hateful using natural language is crucial for fostering transparency in automated content moderation systems. However, evaluating the quality of such explanations remains an open challenge. General-purpose reward models (RMs), commonly used for scoring natural language outputs, are typically optimized for broad notions of safety. We argue that this optimization penalizes situations where references to stereotypes or offensive content are essential for explanations with higher explanatory fidelity. To address this gap, we introduce SBIC-Explain, a human-validated dataset of 370,788 LLM generated NLEs for offensive content, spanning three levels of human-annotated contextual richness: Tier 1: text-only, Tier 2: + classification-aware, and Tier 3: + semantics-informed. We hypothesize that as human-annotated context increases, explanations should lead to higher perceived explanations with higher explanatory fidelity. Yet, we find that existing RMs systematically assign lower scores to more contextually rich (and often more offensive) explanations, revealing a misalignment between model preferences and explanatory fidelity for this context. We propose HARM (Hate-Aware Reward Model), a RM that integrates interpretable signals to better align reward scores with the needs of hate speech explanation. HARM outperforms general-purpose baselines, improving NLE pair-wise preference. Available at: https://github.com/Lorenzo815/HARM.</abstract>
<identifier type="citekey">vecchi-etal-2026-harm</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.230/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>4393</start>
<end>4431</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HARM: Learning Hate-Aware Reward Model for Evaluating Natural Language Explanations of Offensive Content
%A Vecchi, Lorenzo Puppi
%A Jr., Alceu De Souza Britto
%A Paraiso, Emerson Cabrera
%A M. O. Cruz, Rafael
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F vecchi-etal-2026-harm
%X Explaining why content is hateful using natural language is crucial for fostering transparency in automated content moderation systems. However, evaluating the quality of such explanations remains an open challenge. General-purpose reward models (RMs), commonly used for scoring natural language outputs, are typically optimized for broad notions of safety. We argue that this optimization penalizes situations where references to stereotypes or offensive content are essential for explanations with higher explanatory fidelity. To address this gap, we introduce SBIC-Explain, a human-validated dataset of 370,788 LLM generated NLEs for offensive content, spanning three levels of human-annotated contextual richness: Tier 1: text-only, Tier 2: + classification-aware, and Tier 3: + semantics-informed. We hypothesize that as human-annotated context increases, explanations should lead to higher perceived explanations with higher explanatory fidelity. Yet, we find that existing RMs systematically assign lower scores to more contextually rich (and often more offensive) explanations, revealing a misalignment between model preferences and explanatory fidelity for this context. We propose HARM (Hate-Aware Reward Model), a RM that integrates interpretable signals to better align reward scores with the needs of hate speech explanation. HARM outperforms general-purpose baselines, improving NLE pair-wise preference. Available at: https://github.com/Lorenzo815/HARM.
%U https://aclanthology.org/2026.findings-eacl.230/
%P 4393-4431
Markdown (Informal)
[HARM: Learning Hate-Aware Reward Model for Evaluating Natural Language Explanations of Offensive Content](https://aclanthology.org/2026.findings-eacl.230/) (Vecchi et al., Findings 2026)
ACL