@inproceedings{jieyu-etal-2023-training,
title = "Training {NLI} Models Through Universal Adversarial Attack",
author = "Jieyu, Lin and
Wei, Liu and
Jiajie, Zou and
Nai, Ding",
editor = "Sun, Maosong and
Qin, Bing and
Qiu, Xipeng and
Jiang, Jing and
Han, Xianpei",
booktitle = "Proceedings of the 22nd Chinese National Conference on Computational Linguistics",
month = aug,
year = "2023",
address = "Harbin, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2023.ccl-1.72",
pages = "847--861",
abstract = "{``}Pre-trained language models are sensitive to adversarial attacks, and recent works have demonstrated universal adversarial attacks that can apply input-agnostic perturbations to mislead models. Here, we demonstrate that universal adversarial attacks can also be used to harden NLP models. Based on NLI task, we propose a simple universal adversarial attack that can mislead models to produce the same output for all premises by replacing the original hypothesis with an irrelevant string of words. To defend against this attack, we propose Training with UNiversal Adversarial Samples (TUNAS), which iteratively generates universal adversarial samples and utilizes them for fine-tuning. The method is tested on two datasets, i.e., MNLI and SNLI. It is demonstrated that, TUNAS can reduce the mean success rate of the universal adversarial attack from above 79{\%} to below 5{\%}, while maintaining similar performance on the original datasets. Furthermore, TUNAS models are also more robust to the attack targeting at individual samples: When search for hypotheses that are best entailed by a premise, the hypotheses found by TUNAS models are more compatible with the premise than those found by baseline models. In sum, we use universal adversarial attack to yield more robust models.{''}",
language = "English",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jieyu-etal-2023-training">
<titleInfo>
<title>Training NLI Models Through Universal Adversarial Attack</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lin</namePart>
<namePart type="family">Jieyu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liu</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zou</namePart>
<namePart type="family">Jiajie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ding</namePart>
<namePart type="family">Nai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Chinese National Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bing</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xipeng</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Harbin, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“Pre-trained language models are sensitive to adversarial attacks, and recent works have demonstrated universal adversarial attacks that can apply input-agnostic perturbations to mislead models. Here, we demonstrate that universal adversarial attacks can also be used to harden NLP models. Based on NLI task, we propose a simple universal adversarial attack that can mislead models to produce the same output for all premises by replacing the original hypothesis with an irrelevant string of words. To defend against this attack, we propose Training with UNiversal Adversarial Samples (TUNAS), which iteratively generates universal adversarial samples and utilizes them for fine-tuning. The method is tested on two datasets, i.e., MNLI and SNLI. It is demonstrated that, TUNAS can reduce the mean success rate of the universal adversarial attack from above 79% to below 5%, while maintaining similar performance on the original datasets. Furthermore, TUNAS models are also more robust to the attack targeting at individual samples: When search for hypotheses that are best entailed by a premise, the hypotheses found by TUNAS models are more compatible with the premise than those found by baseline models. In sum, we use universal adversarial attack to yield more robust models.”</abstract>
<identifier type="citekey">jieyu-etal-2023-training</identifier>
<location>
<url>https://aclanthology.org/2023.ccl-1.72</url>
</location>
<part>
<date>2023-08</date>
<extent unit="page">
<start>847</start>
<end>861</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Training NLI Models Through Universal Adversarial Attack
%A Jieyu, Lin
%A Wei, Liu
%A Jiajie, Zou
%A Nai, Ding
%Y Sun, Maosong
%Y Qin, Bing
%Y Qiu, Xipeng
%Y Jiang, Jing
%Y Han, Xianpei
%S Proceedings of the 22nd Chinese National Conference on Computational Linguistics
%D 2023
%8 August
%I Chinese Information Processing Society of China
%C Harbin, China
%G English
%F jieyu-etal-2023-training
%X “Pre-trained language models are sensitive to adversarial attacks, and recent works have demonstrated universal adversarial attacks that can apply input-agnostic perturbations to mislead models. Here, we demonstrate that universal adversarial attacks can also be used to harden NLP models. Based on NLI task, we propose a simple universal adversarial attack that can mislead models to produce the same output for all premises by replacing the original hypothesis with an irrelevant string of words. To defend against this attack, we propose Training with UNiversal Adversarial Samples (TUNAS), which iteratively generates universal adversarial samples and utilizes them for fine-tuning. The method is tested on two datasets, i.e., MNLI and SNLI. It is demonstrated that, TUNAS can reduce the mean success rate of the universal adversarial attack from above 79% to below 5%, while maintaining similar performance on the original datasets. Furthermore, TUNAS models are also more robust to the attack targeting at individual samples: When search for hypotheses that are best entailed by a premise, the hypotheses found by TUNAS models are more compatible with the premise than those found by baseline models. In sum, we use universal adversarial attack to yield more robust models.”
%U https://aclanthology.org/2023.ccl-1.72
%P 847-861
Markdown (Informal)
[Training NLI Models Through Universal Adversarial Attack](https://aclanthology.org/2023.ccl-1.72) (Jieyu et al., CCL 2023)
ACL