@inproceedings{wu-etal-2025-sugar,
title = "Sugar-Coated Poison: Benign Generation Unlocks Jailbreaking",
author = "Wu, Yuhang and
Xiong, Yu-Jie and
Zhang, Hao and
Zhang, Jia-Chen and
Zhou, Zheng",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.512/",
pages = "9645--9665",
ISBN = "979-8-89176-335-7",
abstract = "With the increasingly deep integration of large language models (LLMs) across diverse domains, the effectiveness of their safety mechanisms is encountering severe challenges. Currently, jailbreak attacks based on prompt engineering, which induce models to generate potentially harmful content, have become a major security threat. However, existing methods primarily rely on black-box manipulation of prompt templates, resulting in high costs and poor generalizability. To break through the bottleneck, this study reveals the potential impact of the generation of LLMs on safety for the first time that Defense Threshold Decay (DTD) phenomena: as benign content generation increases, the model{'}s attention to input instructions progressively diminishes. Building on this insight, we propose the Sugar-Coated Poison (SCP) attack paradigm, using a ``semantic reversal'' strategy, where benign inputs that are opposite in meaning to malicious intent are crafted to induce the model into a safety response mode. When the defense threshold decays, an adversarial reasoning mechanism easily bypasses safety mechanisms. Experiments show SCP outperforms existing baselines. For defense, we propose Part-of-Speech Defense (POSD), leveraging verb-noun dependencies for syntactic analysis to enhance robustness and security of LLMs. Our code is available at https://anonymous.4open.science/r/SCP-9092."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-etal-2025-sugar">
<titleInfo>
<title>Sugar-Coated Poison: Benign Generation Unlocks Jailbreaking</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuhang</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu-Jie</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jia-Chen</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>With the increasingly deep integration of large language models (LLMs) across diverse domains, the effectiveness of their safety mechanisms is encountering severe challenges. Currently, jailbreak attacks based on prompt engineering, which induce models to generate potentially harmful content, have become a major security threat. However, existing methods primarily rely on black-box manipulation of prompt templates, resulting in high costs and poor generalizability. To break through the bottleneck, this study reveals the potential impact of the generation of LLMs on safety for the first time that Defense Threshold Decay (DTD) phenomena: as benign content generation increases, the model’s attention to input instructions progressively diminishes. Building on this insight, we propose the Sugar-Coated Poison (SCP) attack paradigm, using a “semantic reversal” strategy, where benign inputs that are opposite in meaning to malicious intent are crafted to induce the model into a safety response mode. When the defense threshold decays, an adversarial reasoning mechanism easily bypasses safety mechanisms. Experiments show SCP outperforms existing baselines. For defense, we propose Part-of-Speech Defense (POSD), leveraging verb-noun dependencies for syntactic analysis to enhance robustness and security of LLMs. Our code is available at https://anonymous.4open.science/r/SCP-9092.</abstract>
<identifier type="citekey">wu-etal-2025-sugar</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.512/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>9645</start>
<end>9665</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sugar-Coated Poison: Benign Generation Unlocks Jailbreaking
%A Wu, Yuhang
%A Xiong, Yu-Jie
%A Zhang, Hao
%A Zhang, Jia-Chen
%A Zhou, Zheng
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F wu-etal-2025-sugar
%X With the increasingly deep integration of large language models (LLMs) across diverse domains, the effectiveness of their safety mechanisms is encountering severe challenges. Currently, jailbreak attacks based on prompt engineering, which induce models to generate potentially harmful content, have become a major security threat. However, existing methods primarily rely on black-box manipulation of prompt templates, resulting in high costs and poor generalizability. To break through the bottleneck, this study reveals the potential impact of the generation of LLMs on safety for the first time that Defense Threshold Decay (DTD) phenomena: as benign content generation increases, the model’s attention to input instructions progressively diminishes. Building on this insight, we propose the Sugar-Coated Poison (SCP) attack paradigm, using a “semantic reversal” strategy, where benign inputs that are opposite in meaning to malicious intent are crafted to induce the model into a safety response mode. When the defense threshold decays, an adversarial reasoning mechanism easily bypasses safety mechanisms. Experiments show SCP outperforms existing baselines. For defense, we propose Part-of-Speech Defense (POSD), leveraging verb-noun dependencies for syntactic analysis to enhance robustness and security of LLMs. Our code is available at https://anonymous.4open.science/r/SCP-9092.
%U https://aclanthology.org/2025.findings-emnlp.512/
%P 9645-9665
Markdown (Informal)
[Sugar-Coated Poison: Benign Generation Unlocks Jailbreaking](https://aclanthology.org/2025.findings-emnlp.512/) (Wu et al., Findings 2025)
ACL