BibTeX
@inproceedings{nizar-kobren-2020-leveraging,
title = "Leveraging Extracted Model Adversaries for Improved Black Box Attacks",
author = "Nizar, Naveen Jafer and
Kobren, Ari",
editor = "Alishahi, Afra and
Belinkov, Yonatan and
Chrupa{\l}a, Grzegorz and
Hupkes, Dieuwke and
Pinter, Yuval and
Sajjad, Hassan",
booktitle = "Proceedings of the Third BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.blackboxnlp-1.6",
doi = "10.18653/v1/2020.blackboxnlp-1.6",
pages = "57--67",
abstract = "We present a method for adversarial input generation against black box models for reading comprehension based question answering. Our approach is composed of two steps. First, we approximate a victim black box model via model extraction. Second, we use our own white box method to generate input perturbations that cause the approximate model to fail. These perturbed inputs are used against the victim. In experiments we find that our method improves on the efficacy of the ADDANY{---}a white box attack{---}performed on the approximate model by 25{\%} F1, and the ADDSENT attack{---}a black box attack{---}by 11{\%} F1.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nizar-kobren-2020-leveraging">
<titleInfo>
<title>Leveraging Extracted Model Adversaries for Improved Black Box Attacks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Naveen</namePart>
<namePart type="given">Jafer</namePart>
<namePart type="family">Nizar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ari</namePart>
<namePart type="family">Kobren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Afra</namePart>
<namePart type="family">Alishahi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Grzegorz</namePart>
<namePart type="family">Chrupała</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dieuwke</namePart>
<namePart type="family">Hupkes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuval</namePart>
<namePart type="family">Pinter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hassan</namePart>
<namePart type="family">Sajjad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a method for adversarial input generation against black box models for reading comprehension based question answering. Our approach is composed of two steps. First, we approximate a victim black box model via model extraction. Second, we use our own white box method to generate input perturbations that cause the approximate model to fail. These perturbed inputs are used against the victim. In experiments we find that our method improves on the efficacy of the ADDANY—a white box attack—performed on the approximate model by 25% F1, and the ADDSENT attack—a black box attack—by 11% F1.</abstract>
<identifier type="citekey">nizar-kobren-2020-leveraging</identifier>
<identifier type="doi">10.18653/v1/2020.blackboxnlp-1.6</identifier>
<location>
<url>https://aclanthology.org/2020.blackboxnlp-1.6</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>57</start>
<end>67</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Leveraging Extracted Model Adversaries for Improved Black Box Attacks
%A Nizar, Naveen Jafer
%A Kobren, Ari
%Y Alishahi, Afra
%Y Belinkov, Yonatan
%Y Chrupała, Grzegorz
%Y Hupkes, Dieuwke
%Y Pinter, Yuval
%Y Sajjad, Hassan
%S Proceedings of the Third BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F nizar-kobren-2020-leveraging
%X We present a method for adversarial input generation against black box models for reading comprehension based question answering. Our approach is composed of two steps. First, we approximate a victim black box model via model extraction. Second, we use our own white box method to generate input perturbations that cause the approximate model to fail. These perturbed inputs are used against the victim. In experiments we find that our method improves on the efficacy of the ADDANY—a white box attack—performed on the approximate model by 25% F1, and the ADDSENT attack—a black box attack—by 11% F1.
%R 10.18653/v1/2020.blackboxnlp-1.6
%U https://aclanthology.org/2020.blackboxnlp-1.6
%U https://doi.org/10.18653/v1/2020.blackboxnlp-1.6
%P 57-67
Markdown (Informal)
[Leveraging Extracted Model Adversaries for Improved Black Box Attacks](https://aclanthology.org/2020.blackboxnlp-1.6) (Nizar & Kobren, BlackboxNLP 2020)
ACL
Naveen Jafer Nizar and Ari Kobren. 2020. [Leveraging Extracted Model Adversaries for Improved Black Box Attacks](https://aclanthology.org/2020.blackboxnlp-1.6). In *Proceedings of the Third BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP*, pages 57–67, Online. Association for Computational Linguistics.