@inproceedings{le-etal-2021-sweet,
title = "A Sweet Rabbit Hole by {DARCY}: Using Honeypots to Detect Universal Trigger{'}s Adversarial Attacks",
author = "Le, Thai and
Park, Noseong and
Lee, Dongwon",
editor = "Zong, Chengqing and
Xia, Fei and
Li, Wenjie and
Navigli, Roberto",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-long.296",
doi = "10.18653/v1/2021.acl-long.296",
pages = "3831--3844",
abstract = "The Universal Trigger (UniTrigger) is a recently-proposed powerful adversarial textual attack method. Utilizing a learning-based mechanism, UniTrigger generates a fixed phrase that, when added to any benign inputs, can drop the prediction accuracy of a textual neural network (NN) model to near zero on a target class. To defend against this attack that can cause significant harm, in this paper, we borrow the {``}honeypot{''} concept from the cybersecurity community and propose DARCY, a honeypot-based defense framework against UniTrigger. DARCY greedily searches and injects multiple trapdoors into an NN model to {``}bait and catch{''} potential attacks. Through comprehensive experiments across four public datasets, we show that DARCY detects UniTrigger{'}s adversarial attacks with up to 99{\%} TPR and less than 2{\%} FPR in most cases, while maintaining the prediction accuracy (in F1) for clean inputs within a 1{\%} margin. We also demonstrate that DARCY with multiple trapdoors is also robust to a diverse set of attack scenarios with attackers{'} varying levels of knowledge and skills. We release the source code of DARCY at: \url{https://github.com/lethaiq/ACL2021-DARCY-HoneypotDefenseNLP}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="le-etal-2021-sweet">
<titleInfo>
<title>A Sweet Rabbit Hole by DARCY: Using Honeypots to Detect Universal Trigger’s Adversarial Attacks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Thai</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noseong</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongwon</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenjie</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Navigli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The Universal Trigger (UniTrigger) is a recently-proposed powerful adversarial textual attack method. Utilizing a learning-based mechanism, UniTrigger generates a fixed phrase that, when added to any benign inputs, can drop the prediction accuracy of a textual neural network (NN) model to near zero on a target class. To defend against this attack that can cause significant harm, in this paper, we borrow the “honeypot” concept from the cybersecurity community and propose DARCY, a honeypot-based defense framework against UniTrigger. DARCY greedily searches and injects multiple trapdoors into an NN model to “bait and catch” potential attacks. Through comprehensive experiments across four public datasets, we show that DARCY detects UniTrigger’s adversarial attacks with up to 99% TPR and less than 2% FPR in most cases, while maintaining the prediction accuracy (in F1) for clean inputs within a 1% margin. We also demonstrate that DARCY with multiple trapdoors is also robust to a diverse set of attack scenarios with attackers’ varying levels of knowledge and skills. We release the source code of DARCY at: https://github.com/lethaiq/ACL2021-DARCY-HoneypotDefenseNLP.</abstract>
<identifier type="citekey">le-etal-2021-sweet</identifier>
<identifier type="doi">10.18653/v1/2021.acl-long.296</identifier>
<location>
<url>https://aclanthology.org/2021.acl-long.296</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>3831</start>
<end>3844</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Sweet Rabbit Hole by DARCY: Using Honeypots to Detect Universal Trigger’s Adversarial Attacks
%A Le, Thai
%A Park, Noseong
%A Lee, Dongwon
%Y Zong, Chengqing
%Y Xia, Fei
%Y Li, Wenjie
%Y Navigli, Roberto
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F le-etal-2021-sweet
%X The Universal Trigger (UniTrigger) is a recently-proposed powerful adversarial textual attack method. Utilizing a learning-based mechanism, UniTrigger generates a fixed phrase that, when added to any benign inputs, can drop the prediction accuracy of a textual neural network (NN) model to near zero on a target class. To defend against this attack that can cause significant harm, in this paper, we borrow the “honeypot” concept from the cybersecurity community and propose DARCY, a honeypot-based defense framework against UniTrigger. DARCY greedily searches and injects multiple trapdoors into an NN model to “bait and catch” potential attacks. Through comprehensive experiments across four public datasets, we show that DARCY detects UniTrigger’s adversarial attacks with up to 99% TPR and less than 2% FPR in most cases, while maintaining the prediction accuracy (in F1) for clean inputs within a 1% margin. We also demonstrate that DARCY with multiple trapdoors is also robust to a diverse set of attack scenarios with attackers’ varying levels of knowledge and skills. We release the source code of DARCY at: https://github.com/lethaiq/ACL2021-DARCY-HoneypotDefenseNLP.
%R 10.18653/v1/2021.acl-long.296
%U https://aclanthology.org/2021.acl-long.296
%U https://doi.org/10.18653/v1/2021.acl-long.296
%P 3831-3844
Markdown (Informal)
[A Sweet Rabbit Hole by DARCY: Using Honeypots to Detect Universal Trigger’s Adversarial Attacks](https://aclanthology.org/2021.acl-long.296) (Le et al., ACL-IJCNLP 2021)
ACL