% Workshop paper record exported from the ACL Anthology (see the url field).
% "Transformer" in the title and "NLP" in the booktitle are brace-protected so
% that sentence-casing bibliography styles keep their capital letters.
@inproceedings{attanasio-etal-2022-benchmarking,
  title     = {Benchmarking Post-Hoc Interpretability Approaches for {Transformer}-based Misogyny Detection},
  author    = {Attanasio, Giuseppe and
               Nozza, Debora and
               Pastor, Eliana and
               Hovy, Dirk},
  editor    = {Shavrina, Tatiana and
               Mikhailov, Vladislav and
               Malykh, Valentin and
               Artemova, Ekaterina and
               Serikov, Oleg and
               Protasov, Vitaly},
  booktitle = {Proceedings of {NLP} Power! The First Workshop on Efficient Benchmarking in {NLP}},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.nlppower-1.11},
  doi       = {10.18653/v1/2022.nlppower-1.11},
  pages     = {100--112},
  abstract  = {Transformer-based Natural Language Processing models have become the standard for hate speech detection. However, the unconscious use of these techniques for such a critical task comes with negative consequences. Various works have demonstrated that hate speech classifiers are biased. These findings have prompted efforts to explain classifiers, mainly using attribution methods. In this paper, we provide the first benchmark study of interpretability approaches for hate speech detection. We cover four post-hoc token attribution approaches to explain the predictions of Transformer-based misogyny classifiers in English and Italian. Further, we compare generated attributions to attention analysis. We find that only two algorithms provide faithful explanations aligned with human expectations. Gradient-based methods and attention, however, show inconsistent outputs, making their value for explanations questionable for hate speech detection tasks.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey attanasio-etal-2022-benchmarking. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="attanasio-etal-2022-benchmarking">

    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>Benchmarking Post-Hoc Interpretability Approaches for Transformer-based Misogyny Detection</title>
    </titleInfo>

    <!-- Authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Giuseppe</namePart>
      <namePart type="family">Attanasio</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Debora</namePart>
      <namePart type="family">Nozza</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eliana</namePart>
      <namePart type="family">Pastor</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dirk</namePart>
      <namePart type="family">Hovy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Tatiana</namePart>
        <namePart type="family">Shavrina</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vladislav</namePart>
        <namePart type="family">Mikhailov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Valentin</namePart>
        <namePart type="family">Malykh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Artemova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Oleg</namePart>
        <namePart type="family">Serikov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vitaly</namePart>
        <namePart type="family">Protasov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>Transformer-based Natural Language Processing models have become the standard for hate speech detection. However, the unconscious use of these techniques for such a critical task comes with negative consequences. Various works have demonstrated that hate speech classifiers are biased. These findings have prompted efforts to explain classifiers, mainly using attribution methods. In this paper, we provide the first benchmark study of interpretability approaches for hate speech detection. We cover four post-hoc token attribution approaches to explain the predictions of Transformer-based misogyny classifiers in English and Italian. Further, we compare generated attributions to attention analysis. We find that only two algorithms provide faithful explanations aligned with human expectations. Gradient-based methods and attention, however, show inconsistent outputs, making their value for explanations questionable for hate speech detection tasks.</abstract>

    <!-- Identifiers and canonical landing page. -->
    <identifier type="citekey">attanasio-etal-2022-benchmarking</identifier>
    <identifier type="doi">10.18653/v1/2022.nlppower-1.11</identifier>
    <location>
      <url>https://aclanthology.org/2022.nlppower-1.11</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>100</start>
        <end>112</end>
      </extent>
    </part>

  </mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking Post-Hoc Interpretability Approaches for Transformer-based Misogyny Detection
%A Attanasio, Giuseppe
%A Nozza, Debora
%A Pastor, Eliana
%A Hovy, Dirk
%Y Shavrina, Tatiana
%Y Mikhailov, Vladislav
%Y Malykh, Valentin
%Y Artemova, Ekaterina
%Y Serikov, Oleg
%Y Protasov, Vitaly
%S Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F attanasio-etal-2022-benchmarking
%X Transformer-based Natural Language Processing models have become the standard for hate speech detection. However, the unconscious use of these techniques for such a critical task comes with negative consequences. Various works have demonstrated that hate speech classifiers are biased. These findings have prompted efforts to explain classifiers, mainly using attribution methods. In this paper, we provide the first benchmark study of interpretability approaches for hate speech detection. We cover four post-hoc token attribution approaches to explain the predictions of Transformer-based misogyny classifiers in English and Italian. Further, we compare generated attributions to attention analysis. We find that only two algorithms provide faithful explanations aligned with human expectations. Gradient-based methods and attention, however, show inconsistent outputs, making their value for explanations questionable for hate speech detection tasks.
%R 10.18653/v1/2022.nlppower-1.11
%U https://aclanthology.org/2022.nlppower-1.11
%U https://doi.org/10.18653/v1/2022.nlppower-1.11
%P 100-112
Markdown (Informal)
[Benchmarking Post-Hoc Interpretability Approaches for Transformer-based Misogyny Detection](https://aclanthology.org/2022.nlppower-1.11) (Attanasio et al., nlppower 2022)
ACL