@inproceedings{nguyen-son-etal-2023-votetrans,
    title = "{VoteTRANS}: Detecting Adversarial Text without Training by Voting on Hard Labels of Transformations",
author = "Nguyen-Son, Hoang-Quoc and
Hidano, Seira and
Fukushima, Kazuhide and
Kiyomoto, Shinsaku and
Echizen, Isao",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.315",
doi = "10.18653/v1/2023.findings-acl.315",
pages = "5090--5104",
abstract = "Adversarial attacks reveal serious flaws in deep learning models. More dangerously, these attacks preserve the original meaning and escape human recognition. Existing methods for detecting these attacks need to be trained using original/adversarial data. In this paper, we propose detection without training by voting on hard labels from predictions of transformations, namely, VoteTRANS. Specifically, VoteTRANS detects adversarial text by comparing the hard labels of input text and its transformation. The evaluation demonstrates that VoteTRANS effectively detects adversarial text across various state-of-the-art attacks, models, and datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nguyen-son-etal-2023-votetrans">
<titleInfo>
<title>VoteTRANS: Detecting Adversarial Text without Training by Voting on Hard Labels of Transformations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hoang-Quoc</namePart>
<namePart type="family">Nguyen-Son</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seira</namePart>
<namePart type="family">Hidano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kazuhide</namePart>
<namePart type="family">Fukushima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shinsaku</namePart>
<namePart type="family">Kiyomoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isao</namePart>
<namePart type="family">Echizen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Adversarial attacks reveal serious flaws in deep learning models. More dangerously, these attacks preserve the original meaning and escape human recognition. Existing methods for detecting these attacks need to be trained using original/adversarial data. In this paper, we propose detection without training by voting on hard labels from predictions of transformations, namely, VoteTRANS. Specifically, VoteTRANS detects adversarial text by comparing the hard labels of input text and its transformation. The evaluation demonstrates that VoteTRANS effectively detects adversarial text across various state-of-the-art attacks, models, and datasets.</abstract>
<identifier type="citekey">nguyen-son-etal-2023-votetrans</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.315</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.315</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>5090</start>
<end>5104</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VoteTRANS: Detecting Adversarial Text without Training by Voting on Hard Labels of Transformations
%A Nguyen-Son, Hoang-Quoc
%A Hidano, Seira
%A Fukushima, Kazuhide
%A Kiyomoto, Shinsaku
%A Echizen, Isao
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F nguyen-son-etal-2023-votetrans
%X Adversarial attacks reveal serious flaws in deep learning models. More dangerously, these attacks preserve the original meaning and escape human recognition. Existing methods for detecting these attacks need to be trained using original/adversarial data. In this paper, we propose detection without training by voting on hard labels from predictions of transformations, namely, VoteTRANS. Specifically, VoteTRANS detects adversarial text by comparing the hard labels of input text and its transformation. The evaluation demonstrates that VoteTRANS effectively detects adversarial text across various state-of-the-art attacks, models, and datasets.
%R 10.18653/v1/2023.findings-acl.315
%U https://aclanthology.org/2023.findings-acl.315
%U https://doi.org/10.18653/v1/2023.findings-acl.315
%P 5090-5104
Markdown (Informal)
[VoteTRANS: Detecting Adversarial Text without Training by Voting on Hard Labels of Transformations](https://aclanthology.org/2023.findings-acl.315) (Nguyen-Son et al., Findings 2023)
ACL