@inproceedings{wang-etal-2025-vulnerability,
title = "Vulnerability of Large Language Models to Output Prefix Jailbreaks: Impact of Positions on Safety",
author = "Wang, Yiwei and
Chen, Muhao and
Peng, Nanyun and
Chang, Kai-Wei",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.219/",
doi = "10.18653/v1/2025.findings-naacl.219",
pages = "3939--3952",
ISBN = "979-8-89176-195-7",
abstract = "Previous research on jailbreak attacks has mainly focused on optimizing the adversarial snippet content injected into input prompts to expose LLM security vulnerabilities. A significant portion of this research focuses on developing more complex, less readable adversarial snippets that can achieve higher attack success rates. In contrast to this trend, our research investigates the impact of the adversarial snippet{'}s position on the effectiveness of jailbreak attacks. We find that placing a simple and readable adversarial snippet at the beginning of the output effectively exposes LLM safety vulnerabilities, leading to much higher attack success rates than the input suffix attack or prompt-based output jailbreaks. Precisely speaking, we discover that directly enforcing the user{'}s target embedded output prefix is an effective method to expose LLMs' safety vulnerabilities."
}