@inproceedings{zhang-etal-2023-learning,
title = "Learning to Ignore Adversarial Attacks",
author = "Zhang, Yiming and
Zhou, Yangqiaoyu and
Carton, Samuel and
Tan, Chenhao",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.eacl-main.216",
doi = "10.18653/v1/2023.eacl-main.216",
pages = "2970--2984",
abstract = "Despite the strong performance of current NLP models, they can be brittle against adversarial attacks. To enable effective learning against adversarial inputs, we introduce the use of rationale models that can explicitly learn to ignore attack tokens. We find that the rationale models can successfully ignore over 90{\%} of attack tokens. This approach leads to consistent sizable improvements ({\textasciitilde}10{\%}) over baseline models in robustness on three datasets for both BERT and RoBERTa, and also reliably outperforms data augmentation with adversarial examples alone. In many cases, we find that our method is able to close the gap between model performance on a clean test set and an attacked test set and hence reduce the effect of adversarial attacks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zhang-etal-2023-learning">
    <titleInfo>
      <title>Learning to Ignore Adversarial Attacks</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yiming</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yangqiaoyu</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Samuel</namePart>
      <namePart type="family">Carton</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chenhao</namePart>
      <namePart type="family">Tan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Despite the strong performance of current NLP models, they can be brittle against adversarial attacks. To enable effective learning against adversarial inputs, we introduce the use of rationale models that can explicitly learn to ignore attack tokens. We find that the rationale models can successfully ignore over 90% of attack tokens. This approach leads to consistent sizable improvements (~10%) over baseline models in robustness on three datasets for both BERT and RoBERTa, and also reliably outperforms data augmentation with adversarial examples alone. In many cases, we find that our method is able to close the gap between model performance on a clean test set and an attacked test set and hence reduce the effect of adversarial attacks.</abstract>
    <identifier type="citekey">zhang-etal-2023-learning</identifier>
    <identifier type="doi">10.18653/v1/2023.eacl-main.216</identifier>
    <location>
      <url>https://aclanthology.org/2023.eacl-main.216</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>2970</start>
        <end>2984</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Learning to Ignore Adversarial Attacks
%A Zhang, Yiming
%A Zhou, Yangqiaoyu
%A Carton, Samuel
%A Tan, Chenhao
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F zhang-etal-2023-learning
%X Despite the strong performance of current NLP models, they can be brittle against adversarial attacks. To enable effective learning against adversarial inputs, we introduce the use of rationale models that can explicitly learn to ignore attack tokens. We find that the rationale models can successfully ignore over 90% of attack tokens. This approach leads to consistent sizable improvements (~10%) over baseline models in robustness on three datasets for both BERT and RoBERTa, and also reliably outperforms data augmentation with adversarial examples alone. In many cases, we find that our method is able to close the gap between model performance on a clean test set and an attacked test set and hence reduce the effect of adversarial attacks.
%R 10.18653/v1/2023.eacl-main.216
%U https://aclanthology.org/2023.eacl-main.216
%U https://doi.org/10.18653/v1/2023.eacl-main.216
%P 2970-2984
Markdown (Informal)
[Learning to Ignore Adversarial Attacks](https://aclanthology.org/2023.eacl-main.216) (Zhang et al., EACL 2023)
ACL
Yiming Zhang, Yangqiaoyu Zhou, Samuel Carton, and Chenhao Tan. 2023. Learning to Ignore Adversarial Attacks. In Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, pages 2970–2984, Dubrovnik, Croatia. Association for Computational Linguistics.