@inproceedings{sinha-etal-2021-perturbing,
    title = "Perturbing Inputs for Fragile Interpretations in Deep Natural Language Processing",
    author = "Sinha, Sanchit and
      Chen, Hanjie and
      Sekhon, Arshdeep and
      Ji, Yangfeng and
      Qi, Yanjun",
    editor = "Bastings, Jasmijn and
      Belinkov, Yonatan and
      Dupoux, Emmanuel and
      Giulianelli, Mario and
      Hupkes, Dieuwke and
      Pinter, Yuval and
      Sajjad, Hassan",
    booktitle = "Proceedings of the Fourth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP",
    month = nov,
    year = "2021",
    address = "Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.blackboxnlp-1.33",
    doi = "10.18653/v1/2021.blackboxnlp-1.33",
    pages = "420--434",
abstract = "Interpretability methods like Integrated Gradient and LIME are popular choices for explaining natural language model predictions with relative word importance scores. These interpretations need to be robust for trustworthy NLP applications in high-stake areas like medicine or finance. Our paper demonstrates how interpretations can be manipulated by making simple word perturbations on an input text. Via a small portion of word-level swaps, these adversarial perturbations aim to make the resulting text semantically and spatially similar to its seed input (therefore sharing similar interpretations). Simultaneously, the generated examples achieve the same prediction label as the seed yet are given a substantially different explanation by the interpretation methods. Our experiments generate fragile interpretations to attack two SOTA interpretation methods, across three popular Transformer models and on three different NLP datasets. We observe that the rank order correlation and top-K intersection score drops by over 20{\%} when less than 10{\%} of words are perturbed on average. Further, rank-order correlation keeps decreasing as more words get perturbed. Furthermore, we demonstrate that candidates generated from our method have good quality metrics.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sinha-etal-2021-perturbing">
<titleInfo>
<title>Perturbing Inputs for Fragile Interpretations in Deep Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sanchit</namePart>
<namePart type="family">Sinha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanjie</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arshdeep</namePart>
<namePart type="family">Sekhon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangfeng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanjun</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jasmijn</namePart>
<namePart type="family">Bastings</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmanuel</namePart>
<namePart type="family">Dupoux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mario</namePart>
<namePart type="family">Giulianelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dieuwke</namePart>
<namePart type="family">Hupkes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuval</namePart>
<namePart type="family">Pinter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hassan</namePart>
<namePart type="family">Sajjad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Interpretability methods like Integrated Gradients and LIME are popular choices for explaining natural language model predictions with relative word-importance scores. These interpretations need to be robust for trustworthy NLP applications in high-stakes areas like medicine or finance. Our paper demonstrates how interpretations can be manipulated by making simple word perturbations on an input text. Via a small fraction of word-level swaps, these adversarial perturbations aim to make the resulting text semantically and spatially similar to its seed input (and therefore share similar interpretations). Simultaneously, the generated examples achieve the same prediction label as the seed, yet the interpretation methods assign them substantially different explanations. Our experiments generate fragile interpretations to attack two SOTA interpretation methods, across three popular Transformer models and three different NLP datasets. We observe that the rank-order correlation and the top-K intersection score drop by over 20% when fewer than 10% of words are perturbed on average. Moreover, rank-order correlation keeps decreasing as more words get perturbed. Finally, we demonstrate that the candidates generated by our method score well on quality metrics.</abstract>
<identifier type="citekey">sinha-etal-2021-perturbing</identifier>
<identifier type="doi">10.18653/v1/2021.blackboxnlp-1.33</identifier>
<location>
<url>https://aclanthology.org/2021.blackboxnlp-1.33</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>420</start>
<end>434</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Perturbing Inputs for Fragile Interpretations in Deep Natural Language Processing
%A Sinha, Sanchit
%A Chen, Hanjie
%A Sekhon, Arshdeep
%A Ji, Yangfeng
%A Qi, Yanjun
%Y Bastings, Jasmijn
%Y Belinkov, Yonatan
%Y Dupoux, Emmanuel
%Y Giulianelli, Mario
%Y Hupkes, Dieuwke
%Y Pinter, Yuval
%Y Sajjad, Hassan
%S Proceedings of the Fourth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F sinha-etal-2021-perturbing
%X Interpretability methods like Integrated Gradients and LIME are popular choices for explaining natural language model predictions with relative word-importance scores. These interpretations need to be robust for trustworthy NLP applications in high-stakes areas like medicine or finance. Our paper demonstrates how interpretations can be manipulated by making simple word perturbations on an input text. Via a small fraction of word-level swaps, these adversarial perturbations aim to make the resulting text semantically and spatially similar to its seed input (and therefore share similar interpretations). Simultaneously, the generated examples achieve the same prediction label as the seed, yet the interpretation methods assign them substantially different explanations. Our experiments generate fragile interpretations to attack two SOTA interpretation methods, across three popular Transformer models and three different NLP datasets. We observe that the rank-order correlation and the top-K intersection score drop by over 20% when fewer than 10% of words are perturbed on average. Moreover, rank-order correlation keeps decreasing as more words get perturbed. Finally, we demonstrate that the candidates generated by our method score well on quality metrics.
%R 10.18653/v1/2021.blackboxnlp-1.33
%U https://aclanthology.org/2021.blackboxnlp-1.33
%U https://doi.org/10.18653/v1/2021.blackboxnlp-1.33
%P 420-434
Markdown (Informal)
[Perturbing Inputs for Fragile Interpretations in Deep Natural Language Processing](https://aclanthology.org/2021.blackboxnlp-1.33) (Sinha et al., BlackboxNLP 2021)
ACL
Sanchit Sinha, Hanjie Chen, Arshdeep Sekhon, Yangfeng Ji, and Yanjun Qi. 2021. Perturbing Inputs for Fragile Interpretations in Deep Natural Language Processing. In Proceedings of the Fourth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP, pages 420–434, Punta Cana, Dominican Republic. Association for Computational Linguistics.
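
The abstract quantifies interpretation fragility with two scores: the rank-order correlation and the top-K intersection between the word-importance vector of a seed input and that of its perturbed variant. The sketch below shows one way these two measures can be computed; it is not the authors' released code, and the function names and toy importance scores are illustrative assumptions.

```python
# A minimal sketch (assumed, not the paper's implementation) of the two
# fragility measures named in the abstract: rank-order correlation and
# top-K intersection between two word-importance vectors.
import numpy as np
from scipy.stats import spearmanr


def top_k_intersection(seed_scores, perturbed_scores, k=5):
    """Fraction of the k most important word positions shared by both explanations."""
    top_seed = set(np.argsort(seed_scores)[-k:])
    top_pert = set(np.argsort(perturbed_scores)[-k:])
    return len(top_seed & top_pert) / k


def rank_order_correlation(seed_scores, perturbed_scores):
    """Spearman correlation between the two word-importance rankings."""
    rho, _ = spearmanr(seed_scores, perturbed_scores)
    return rho


# Toy example: importance scores for the same 8 token positions before and
# after a small number of word-level swaps. A fragile interpretation shows
# low correlation and low top-K overlap even though the predicted label
# (and the text's meaning) is essentially unchanged.
seed = np.array([0.9, 0.1, 0.05, 0.7, 0.2, 0.6, 0.02, 0.3])
perturbed = np.array([0.1, 0.8, 0.6, 0.05, 0.7, 0.1, 0.4, 0.02])

print(f"Spearman rho:       {rank_order_correlation(seed, perturbed):.3f}")
print(f"top-5 intersection: {top_k_intersection(seed, perturbed, k=5):.2f}")
```

Under this reading, an attack succeeds when these scores drop sharply (the paper reports drops of over 20% with fewer than 10% of words perturbed) while the classifier's predicted label stays the same.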