@inproceedings{bespalov-etal-2023-towards,
  title     = {Towards Building a Robust Toxicity Predictor},
  author    = {Bespalov, Dmitriy and
               Bhabesh, Sourav and
               Xiang, Yi and
               Zhou, Liutong and
               Qi, Yanjun},
  editor    = {Sitaram, Sunayana and
               Beigman Klebanov, Beata and
               Williams, Jason D.},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)},
  month     = jul,
  year      = {2023},
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.acl-industry.56/},
  doi       = {10.18653/v1/2023.acl-industry.56},
  pages     = {581--598},
  abstract  = {Recent NLP literature pays little attention to the robustness of toxicity language predictors, while these systems are most likely to be used in adversarial contexts. This paper presents a novel adversarial attack, \texttt{ToxicTrap}, introducing small word-level perturbations to fool SOTA text classifiers to predict toxic text samples as benign. \texttt{ToxicTrap} exploits greedy based search strategies to enable fast and effective generation of toxic adversarial examples. Two novel goal function designs allow \texttt{ToxicTrap} to identify weaknesses in both multiclass and multilabel toxic language detectors. Our empirical results show that SOTA toxicity text classifiers are indeed vulnerable to the proposed attacks, attaining over 98\% attack success rates in multilabel cases. We also show how a vanilla adversarial training and its improved version can help increase robustness of a toxicity detector even against unseen attacks.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bespalov-etal-2023-towards">
<titleInfo>
<title>Towards Building a Robust Toxicity Predictor</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dmitriy</namePart>
<namePart type="family">Bespalov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sourav</namePart>
<namePart type="family">Bhabesh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liutong</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanjun</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sunayana</namePart>
<namePart type="family">Sitaram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Beata</namePart>
<namePart type="family">Beigman Klebanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent NLP literature pays little attention to the robustness of toxicity language predictors, while these systems are most likely to be used in adversarial contexts. This paper presents a novel adversarial attack, ToxicTrap, introducing small word-level perturbations to fool SOTA text classifiers to predict toxic text samples as benign. ToxicTrap exploits greedy based search strategies to enable fast and effective generation of toxic adversarial examples. Two novel goal function designs allow ToxicTrap to identify weaknesses in both multiclass and multilabel toxic language detectors. Our empirical results show that SOTA toxicity text classifiers are indeed vulnerable to the proposed attacks, attaining over 98% attack success rates in multilabel cases. We also show how a vanilla adversarial training and its improved version can help increase robustness of a toxicity detector even against unseen attacks.</abstract>
<identifier type="citekey">bespalov-etal-2023-towards</identifier>
<identifier type="doi">10.18653/v1/2023.acl-industry.56</identifier>
<location>
<url>https://aclanthology.org/2023.acl-industry.56/</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>581</start>
<end>598</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Building a Robust Toxicity Predictor
%A Bespalov, Dmitriy
%A Bhabesh, Sourav
%A Xiang, Yi
%A Zhou, Liutong
%A Qi, Yanjun
%Y Sitaram, Sunayana
%Y Beigman Klebanov, Beata
%Y Williams, Jason D.
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F bespalov-etal-2023-towards
%X Recent NLP literature pays little attention to the robustness of toxicity language predictors, while these systems are most likely to be used in adversarial contexts. This paper presents a novel adversarial attack, ToxicTrap, introducing small word-level perturbations to fool SOTA text classifiers to predict toxic text samples as benign. ToxicTrap exploits greedy based search strategies to enable fast and effective generation of toxic adversarial examples. Two novel goal function designs allow ToxicTrap to identify weaknesses in both multiclass and multilabel toxic language detectors. Our empirical results show that SOTA toxicity text classifiers are indeed vulnerable to the proposed attacks, attaining over 98% attack success rates in multilabel cases. We also show how a vanilla adversarial training and its improved version can help increase robustness of a toxicity detector even against unseen attacks.
%R 10.18653/v1/2023.acl-industry.56
%U https://aclanthology.org/2023.acl-industry.56/
%U https://doi.org/10.18653/v1/2023.acl-industry.56
%P 581-598
Markdown (Informal)
[Towards Building a Robust Toxicity Predictor](https://aclanthology.org/2023.acl-industry.56/) (Bespalov et al., ACL 2023)
ACL
- Dmitriy Bespalov, Sourav Bhabesh, Yi Xiang, Liutong Zhou, and Yanjun Qi. 2023. Towards Building a Robust Toxicity Predictor. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track), pages 581–598, Toronto, Canada. Association for Computational Linguistics.