@inproceedings{casula-etal-2024-delving,
title = "Delving into Qualitative Implications of Synthetic Data for Hate Speech Detection",
author = "Casula, Camilla and
Vecellio Salto, Sebastiano and
Ramponi, Alan and
Tonelli, Sara",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1099/",
doi = "10.18653/v1/2024.emnlp-main.1099",
pages = "19709--19726",
abstract = "The use of synthetic data for training models for a variety of NLP tasks is now widespread. However, previous work reports mixed results with regards to its effectiveness on highly subjective tasks such as hate speech detection. In this paper, we present an in-depth qualitative analysis of the potential and specific pitfalls of synthetic data for hate speech detection in English, with 3,500 manually annotated examples. We show that, across different models, synthetic data created through paraphrasing gold texts can improve out-of-distribution robustness from a computational standpoint. However, this comes at a cost: synthetic data fails to reliably reflect the characteristics of real-world data on a number of linguistic dimensions, it results in drastically different class distributions, and it heavily reduces the representation of both specific identity groups and intersectional hate."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="casula-etal-2024-delving">
<titleInfo>
<title>Delving into Qualitative Implications of Synthetic Data for Hate Speech Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Camilla</namePart>
<namePart type="family">Casula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastiano</namePart>
<namePart type="family">Vecellio Salto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ramponi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Tonelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The use of synthetic data for training models for a variety of NLP tasks is now widespread. However, previous work reports mixed results with regards to its effectiveness on highly subjective tasks such as hate speech detection. In this paper, we present an in-depth qualitative analysis of the potential and specific pitfalls of synthetic data for hate speech detection in English, with 3,500 manually annotated examples. We show that, across different models, synthetic data created through paraphrasing gold texts can improve out-of-distribution robustness from a computational standpoint. However, this comes at a cost: synthetic data fails to reliably reflect the characteristics of real-world data on a number of linguistic dimensions, it results in drastically different class distributions, and it heavily reduces the representation of both specific identity groups and intersectional hate.</abstract>
<identifier type="citekey">casula-etal-2024-delving</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.1099</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1099/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>19709</start>
<end>19726</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Delving into Qualitative Implications of Synthetic Data for Hate Speech Detection
%A Casula, Camilla
%A Vecellio Salto, Sebastiano
%A Ramponi, Alan
%A Tonelli, Sara
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F casula-etal-2024-delving
%X The use of synthetic data for training models for a variety of NLP tasks is now widespread. However, previous work reports mixed results with regards to its effectiveness on highly subjective tasks such as hate speech detection. In this paper, we present an in-depth qualitative analysis of the potential and specific pitfalls of synthetic data for hate speech detection in English, with 3,500 manually annotated examples. We show that, across different models, synthetic data created through paraphrasing gold texts can improve out-of-distribution robustness from a computational standpoint. However, this comes at a cost: synthetic data fails to reliably reflect the characteristics of real-world data on a number of linguistic dimensions, it results in drastically different class distributions, and it heavily reduces the representation of both specific identity groups and intersectional hate.
%R 10.18653/v1/2024.emnlp-main.1099
%U https://aclanthology.org/2024.emnlp-main.1099/
%U https://doi.org/10.18653/v1/2024.emnlp-main.1099
%P 19709-19726
Markdown (Informal)
[Delving into Qualitative Implications of Synthetic Data for Hate Speech Detection](https://aclanthology.org/2024.emnlp-main.1099/) (Casula et al., EMNLP 2024)
ACL