% ACL Anthology record for a Findings of EMNLP 2025 paper (GENES counterfactual text generation).
% The abstract is stored as plain text; presentation markup is left to the bibliography style.
@inproceedings{van-supranes-etal-2025-enhancing,
  title     = {Enhancing Hate Speech Classifiers through a Gradient-assisted Counterfactual Text Generation Strategy},
  author    = {Van Supranes, Michael and
               Peng, Shaowen and
               Wakamiya, Shoko and
               Aramaki, Eiji},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.189/},
  doi       = {10.18653/v1/2025.findings-emnlp.189},
  pages     = {3529--3544},
  isbn      = {979-8-89176-335-7},
  abstract  = {Counterfactual data augmentation (CDA) is a promising strategy for improving hate speech classification, but automating counterfactual text generation remains a challenge. Strong attribute control can distort meaning, while prioritizing semantic preservation may weaken attribute alignment. We propose Gradient-assisted Energy-based Sampling (GENES) for counterfactual text generation, which restricts accepted samples to text meeting a minimum BERTScore threshold and applies gradient-assisted proposal generation to improve attribute alignment. Compared to other methods that solely rely on either prompting, gradient-based steering, or energy-based sampling, GENES is more likely to jointly satisfy attribute alignment and semantic preservation under the same base model. When applied to data augmentation, GENES achieved the best macro F1-score in two of three test sets, and it improved robustness in detecting targeted abusive language. In some cases, GENES exceeded the performance of prompt-based methods using a GPT-4o-mini, despite relying on a smaller model (Flan-T5-Large). Based on our cross-dataset evaluation, the average performance of models aided by GENES is the best among those methods that rely on a smaller model (Flan-T5-L). These results position GENES as a possible lightweight and open-source alternative.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey van-supranes-etal-2025-enhancing.
     Authors are listed on the item itself; editors, publisher, place and ISBN
     belong to the host proceedings (relatedItem type="host"). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="van-supranes-etal-2025-enhancing">
    <titleInfo>
      <title>Enhancing Hate Speech Classifiers through a Gradient-assisted Counterfactual Text Generation Strategy</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Michael</namePart>
      <namePart type="family">Van Supranes</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shaowen</namePart>
      <namePart type="family">Peng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shoko</namePart>
      <namePart type="family">Wakamiya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eiji</namePart>
      <namePart type="family">Aramaki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume containing this paper -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>
    <!-- Abstract kept as plain text (no Markdown emphasis markers) -->
    <abstract>Counterfactual data augmentation (CDA) is a promising strategy for improving hate speech classification, but automating counterfactual text generation remains a challenge. Strong attribute control can distort meaning, while prioritizing semantic preservation may weaken attribute alignment. We propose Gradient-assisted Energy-based Sampling (GENES) for counterfactual text generation, which restricts accepted samples to text meeting a minimum BERTScore threshold and applies gradient-assisted proposal generation to improve attribute alignment. Compared to other methods that solely rely on either prompting, gradient-based steering, or energy-based sampling, GENES is more likely to jointly satisfy attribute alignment and semantic preservation under the same base model. When applied to data augmentation, GENES achieved the best macro F1-score in two of three test sets, and it improved robustness in detecting targeted abusive language. In some cases, GENES exceeded the performance of prompt-based methods using a GPT-4o-mini, despite relying on a smaller model (Flan-T5-Large). Based on our cross-dataset evaluation, the average performance of models aided by GENES is the best among those methods that rely on a smaller model (Flan-T5-L). These results position GENES as a possible lightweight and open-source alternative.</abstract>
    <identifier type="citekey">van-supranes-etal-2025-enhancing</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-emnlp.189</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.189/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>3529</start>
        <end>3544</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Hate Speech Classifiers through a Gradient-assisted Counterfactual Text Generation Strategy
%A Van Supranes, Michael
%A Peng, Shaowen
%A Wakamiya, Shoko
%A Aramaki, Eiji
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F van-supranes-etal-2025-enhancing
%X Counterfactual data augmentation (CDA) is a promising strategy for improving hate speech classification, but automating counterfactual text generation remains a challenge. Strong attribute control can distort meaning, while prioritizing semantic preservation may weaken attribute alignment. We propose Gradient-assisted Energy-based Sampling (GENES) for counterfactual text generation, which restricts accepted samples to text meeting a minimum BERTScore threshold and applies gradient-assisted proposal generation to improve attribute alignment. Compared to other methods that solely rely on either prompting, gradient-based steering, or energy-based sampling, GENES is more likely to jointly satisfy attribute alignment and semantic preservation under the same base model. When applied to data augmentation, GENES achieved the best macro F1-score in two of three test sets, and it improved robustness in detecting targeted abusive language. In some cases, GENES exceeded the performance of prompt-based methods using a GPT-4o-mini, despite relying on a smaller model (Flan-T5-Large). Based on our cross-dataset evaluation, the average performance of models aided by GENES is the best among those methods that rely on a smaller model (Flan-T5-L). These results position GENES as a possible lightweight and open-source alternative.
%R 10.18653/v1/2025.findings-emnlp.189
%U https://aclanthology.org/2025.findings-emnlp.189/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.189
%P 3529-3544
Markdown (Informal)
[Enhancing Hate Speech Classifiers through a Gradient-assisted Counterfactual Text Generation Strategy](https://aclanthology.org/2025.findings-emnlp.189/) (Van Supranes et al., Findings 2025)
ACL