@inproceedings{pu-etal-2024-baitattack,
title = "{B}ait{A}ttack: Alleviating Intention Shift in Jailbreak Attacks via Adaptive Bait Crafting",
author = "Pu, Rui and
Li, Chaozhuo and
Ha, Rui and
Zhang, Litian and
Qiu, Lirong and
Zhang, Xi",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.877",
pages = "15654--15668",
abstract = "Jailbreak attacks enable malicious queries to evade detection by LLMs. Existing attacks focus on meticulously constructing prompts to disguise harmful intentions. However, the incorporation of sophisticated disguising prompts may incur the challenge of {``}intention shift{''}. Intention shift occurs when the additional semantics within the prompt distract the LLMs, causing the responses to deviate significantly from the original harmful intentions. In this paper, we propose a novel component, {``}bait{''}, to alleviate the effects of intention shift. Bait comprises an initial response to the harmful query, prompting LLMs to rectify or supplement the knowledge within the bait. By furnishing rich semantics relevant to the query, the bait helps LLMs focus on the original intention. To conceal the harmful content within the bait, we further propose a novel attack paradigm, BaitAttack. BaitAttack adaptively generates necessary components to persuade targeted LLMs that they are engaging with a legitimate inquiry in a safe context. Our proposal is evaluated on a popular dataset, demonstrating state-of-the-art attack performance and an exceptional capability for mitigating intention shift. The implementation of BaitAttack is accessible at: https://anonymous.4open.science/r/BaitAttack-D1F5.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pu-etal-2024-baitattack">
<titleInfo>
<title>BaitAttack: Alleviating Intention Shift in Jailbreak Attacks via Adaptive Bait Crafting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Pu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chaozhuo</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Ha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Litian</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lirong</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Jailbreak attacks enable malicious queries to evade detection by LLMs. Existing attacks focus on meticulously constructing prompts to disguise harmful intentions. However, the incorporation of sophisticated disguising prompts may incur the challenge of “intention shift”. Intention shift occurs when the additional semantics within the prompt distract the LLMs, causing the responses to deviate significantly from the original harmful intentions. In this paper, we propose a novel component, “bait”, to alleviate the effects of intention shift. Bait comprises an initial response to the harmful query, prompting LLMs to rectify or supplement the knowledge within the bait. By furnishing rich semantics relevant to the query, the bait helps LLMs focus on the original intention. To conceal the harmful content within the bait, we further propose a novel attack paradigm, BaitAttack. BaitAttack adaptively generates necessary components to persuade targeted LLMs that they are engaging with a legitimate inquiry in a safe context. Our proposal is evaluated on a popular dataset, demonstrating state-of-the-art attack performance and an exceptional capability for mitigating intention shift. The implementation of BaitAttack is accessible at: https://anonymous.4open.science/r/BaitAttack-D1F5.</abstract>
<identifier type="citekey">pu-etal-2024-baitattack</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.877</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>15654</start>
<end>15668</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BaitAttack: Alleviating Intention Shift in Jailbreak Attacks via Adaptive Bait Crafting
%A Pu, Rui
%A Li, Chaozhuo
%A Ha, Rui
%A Zhang, Litian
%A Qiu, Lirong
%A Zhang, Xi
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F pu-etal-2024-baitattack
%X Jailbreak attacks enable malicious queries to evade detection by LLMs. Existing attacks focus on meticulously constructing prompts to disguise harmful intentions. However, the incorporation of sophisticated disguising prompts may incur the challenge of “intention shift”. Intention shift occurs when the additional semantics within the prompt distract the LLMs, causing the responses to deviate significantly from the original harmful intentions. In this paper, we propose a novel component, “bait”, to alleviate the effects of intention shift. Bait comprises an initial response to the harmful query, prompting LLMs to rectify or supplement the knowledge within the bait. By furnishing rich semantics relevant to the query, the bait helps LLMs focus on the original intention. To conceal the harmful content within the bait, we further propose a novel attack paradigm, BaitAttack. BaitAttack adaptively generates necessary components to persuade targeted LLMs that they are engaging with a legitimate inquiry in a safe context. Our proposal is evaluated on a popular dataset, demonstrating state-of-the-art attack performance and an exceptional capability for mitigating intention shift. The implementation of BaitAttack is accessible at: https://anonymous.4open.science/r/BaitAttack-D1F5.
%U https://aclanthology.org/2024.emnlp-main.877
%P 15654-15668
Markdown (Informal)
[BaitAttack: Alleviating Intention Shift in Jailbreak Attacks via Adaptive Bait Crafting](https://aclanthology.org/2024.emnlp-main.877) (Pu et al., EMNLP 2024)
ACL