@inproceedings{tiwari-etal-2022-robust,
title = "Robust Hate Speech Detection via Mitigating Spurious Correlations",
author = "Tiwari, Kshitiz and
Yuan, Shuhan and
Zhang, Lu",
editor = "He, Yulan and
Ji, Heng and
Li, Sujian and
Liu, Yang and
Chang, Chua-Hui",
booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = nov,
year = "2022",
address = "Online only",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.aacl-short.7/",
doi = "10.18653/v1/2022.aacl-short.7",
pages = "51--56",
abstract = "We develop a novel robust hate speech detection model that can defend against both word- and character-level adversarial attacks. We identify the essential factor that vanilla detection models are vulnerable to adversarial attacks is the spurious correlation between certain target words in the text and the prediction label. To mitigate such spurious correlation, we describe the process of hate speech detection by a causal graph. Then, we employ the causal strength to quantify the spurious correlation and formulate a regularized entropy loss function. We show that our method generalizes the backdoor adjustment technique in causal inference. Finally, the empirical evaluation shows the efficacy of our method."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tiwari-etal-2022-robust">
<titleInfo>
<title>Robust Hate Speech Detection via Mitigating Spurious Correlations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kshitiz</namePart>
<namePart type="family">Tiwari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuhan</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chua-Hui</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online only</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We develop a novel robust hate speech detection model that can defend against both word- and character-level adversarial attacks. We identify the essential factor that vanilla detection models are vulnerable to adversarial attacks is the spurious correlation between certain target words in the text and the prediction label. To mitigate such spurious correlation, we describe the process of hate speech detection by a causal graph. Then, we employ the causal strength to quantify the spurious correlation and formulate a regularized entropy loss function. We show that our method generalizes the backdoor adjustment technique in causal inference. Finally, the empirical evaluation shows the efficacy of our method.</abstract>
<identifier type="citekey">tiwari-etal-2022-robust</identifier>
<identifier type="doi">10.18653/v1/2022.aacl-short.7</identifier>
<location>
<url>https://aclanthology.org/2022.aacl-short.7/</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>51</start>
<end>56</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Robust Hate Speech Detection via Mitigating Spurious Correlations
%A Tiwari, Kshitiz
%A Yuan, Shuhan
%A Zhang, Lu
%Y He, Yulan
%Y Ji, Heng
%Y Li, Sujian
%Y Liu, Yang
%Y Chang, Chua-Hui
%S Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)
%D 2022
%8 November
%I Association for Computational Linguistics
%C Online only
%F tiwari-etal-2022-robust
%X We develop a novel robust hate speech detection model that can defend against both word- and character-level adversarial attacks. We identify the essential factor that vanilla detection models are vulnerable to adversarial attacks is the spurious correlation between certain target words in the text and the prediction label. To mitigate such spurious correlation, we describe the process of hate speech detection by a causal graph. Then, we employ the causal strength to quantify the spurious correlation and formulate a regularized entropy loss function. We show that our method generalizes the backdoor adjustment technique in causal inference. Finally, the empirical evaluation shows the efficacy of our method.
%R 10.18653/v1/2022.aacl-short.7
%U https://aclanthology.org/2022.aacl-short.7/
%U https://doi.org/10.18653/v1/2022.aacl-short.7
%P 51-56
Markdown (Informal)
[Robust Hate Speech Detection via Mitigating Spurious Correlations](https://aclanthology.org/2022.aacl-short.7/) (Tiwari et al., AACL-IJCNLP 2022)
ACL
- Kshitiz Tiwari, Shuhan Yuan, and Lu Zhang. 2022. Robust Hate Speech Detection via Mitigating Spurious Correlations. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), pages 51–56, Online only. Association for Computational Linguistics.