@inproceedings{casula-etal-2024-dont,
title = "Don{'}t Augment, Rewrite? Assessing Abusive Language Detection with Synthetic Data",
author = "Casula, Camilla and
Leonardelli, Elisa and
Tonelli, Sara",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.669",
doi = "10.18653/v1/2024.findings-acl.669",
pages = "11240--11247",
abstract = "Research on abusive language detection and content moderation is crucial to combat online harm. However, current limitations set by regulatory bodies and social media platforms can make it difficult to share collected data. We address this challenge by exploring the possibility to replace existing datasets in English for abusive language detection with synthetic data obtained by rewriting original texts with an instruction-based generative model.We show that such data can be effectively used to train a classifier whose performance is in line, and sometimes better, than a classifier trained on original data. Training with synthetic data also seems to improve robustness in a cross-dataset setting. A manual inspection of the generated data confirms that rewriting makes it impossible to retrieve the original texts online.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="casula-etal-2024-dont">
<titleInfo>
<title>Don’t Augment, Rewrite? Assessing Abusive Language Detection with Synthetic Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Camilla</namePart>
<namePart type="family">Casula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elisa</namePart>
<namePart type="family">Leonardelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Tonelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Research on abusive language detection and content moderation is crucial to combat online harm. However, current limitations set by regulatory bodies and social media platforms can make it difficult to share collected data. We address this challenge by exploring the possibility to replace existing datasets in English for abusive language detection with synthetic data obtained by rewriting original texts with an instruction-based generative model.We show that such data can be effectively used to train a classifier whose performance is in line, and sometimes better, than a classifier trained on original data. Training with synthetic data also seems to improve robustness in a cross-dataset setting. A manual inspection of the generated data confirms that rewriting makes it impossible to retrieve the original texts online.</abstract>
<identifier type="citekey">casula-etal-2024-dont</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.669</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.669</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>11240</start>
<end>11247</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Don’t Augment, Rewrite? Assessing Abusive Language Detection with Synthetic Data
%A Casula, Camilla
%A Leonardelli, Elisa
%A Tonelli, Sara
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F casula-etal-2024-dont
%X Research on abusive language detection and content moderation is crucial to combat online harm. However, current limitations set by regulatory bodies and social media platforms can make it difficult to share collected data. We address this challenge by exploring the possibility to replace existing datasets in English for abusive language detection with synthetic data obtained by rewriting original texts with an instruction-based generative model.We show that such data can be effectively used to train a classifier whose performance is in line, and sometimes better, than a classifier trained on original data. Training with synthetic data also seems to improve robustness in a cross-dataset setting. A manual inspection of the generated data confirms that rewriting makes it impossible to retrieve the original texts online.
%R 10.18653/v1/2024.findings-acl.669
%U https://aclanthology.org/2024.findings-acl.669
%U https://doi.org/10.18653/v1/2024.findings-acl.669
%P 11240-11247
Markdown (Informal)
[Don’t Augment, Rewrite? Assessing Abusive Language Detection with Synthetic Data](https://aclanthology.org/2024.findings-acl.669) (Casula et al., Findings 2024)
ACL