@inproceedings{parent-etal-2025-interaction,
title = "On the Interaction of Identity Hate Classification and Data Bias",
author = {Parent, Donnie and
Georgiades, Nina and
Mishra, Charvi and
Mohammed, Khaled and
K{\"u}bler, Sandra},
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.103/",
pages = "900--906",
abstract = "Hate speech detection is a task where machine learning models tend to be limited by the biases introduced by the dataset. We use two existing datasets of hate speech towards identity groups, the one by Wiegand et al. (2022) and a reannotated subset of the data in AbuseEval (Caselli et al. 2020). Since the data by Wiegand et al. (2022) were collected using one syntactic pattern, there exists a possible syntactic bias in this dataset. We test whether there exists such a bias by using a more syntactically general dataset for testing. Our findings show that classifiers trained on the dataset with the syntactic bias and tested on a less constrained dataset suffer from a loss in performance in the order of 20 points. Further experiments show that this drop can only be partly attributed to a shift in identity groups between datasets."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="parent-etal-2025-interaction">
<titleInfo>
<title>On the Interaction of Identity Hate Classification and Data Bias</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donnie</namePart>
<namePart type="family">Parent</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nina</namePart>
<namePart type="family">Georgiades</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Charvi</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khaled</namePart>
<namePart type="family">Mohammed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandra</namePart>
<namePart type="family">Kübler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Hate speech detection is a task where machine learning models tend to be limited by the biases introduced by the dataset. We use two existing datasets of hate speech towards identity groups, the one by Wiegand et al. (2022) and a reannotated subset of the data in AbuseEval (Caselli et al. 2020). Since the data by Wiegand et al. (2022) were collected using one syntactic pattern, there exists a possible syntactic bias in this dataset. We test whether there exists such a bias by using a more syntactically general dataset for testing. Our findings show that classifiers trained on the dataset with the syntactic bias and tested on a less constrained dataset suffer from a loss in performance in the order of 20 points. Further experiments show that this drop can only be partly attributed to a shift in identity groups between datasets.</abstract>
<identifier type="citekey">parent-etal-2025-interaction</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.103/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>900</start>
<end>906</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Interaction of Identity Hate Classification and Data Bias
%A Parent, Donnie
%A Georgiades, Nina
%A Mishra, Charvi
%A Mohammed, Khaled
%A Kübler, Sandra
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F parent-etal-2025-interaction
%X Hate speech detection is a task where machine learning models tend to be limited by the biases introduced by the dataset. We use two existing datasets of hate speech towards identity groups, the one by Wiegand et al. (2022) and a reannotated subset of the data in AbuseEval (Caselli et al. 2020). Since the data by Wiegand et al. (2022) were collected using one syntactic pattern, there exists a possible syntactic bias in this dataset. We test whether there exists such a bias by using a more syntactically general dataset for testing. Our findings show that classifiers trained on the dataset with the syntactic bias and tested on a less constrained dataset suffer from a loss in performance in the order of 20 points. Further experiments show that this drop can only be partly attributed to a shift in identity groups between datasets.
%U https://aclanthology.org/2025.ranlp-1.103/
%P 900-906
Markdown (Informal)
[On the Interaction of Identity Hate Classification and Data Bias](https://aclanthology.org/2025.ranlp-1.103/) (Parent et al., RANLP 2025)
ACL
- Donnie Parent, Nina Georgiades, Charvi Mishra, Khaled Mohammed, and Sandra Kübler. 2025. On the Interaction of Identity Hate Classification and Data Bias. In Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era, pages 900–906, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.