@inproceedings{gao-etal-2025-shaping,
title = "Shaping the Safety Boundaries: Understanding and Defending Against Jailbreaks in Large Language Models",
author = "Gao, Lang and
Geng, Jiahui and
Zhang, Xiangliang and
Nakov, Preslav and
Chen, Xiuying",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1233/",
doi = "10.18653/v1/2025.acl-long.1233",
pages = "25378--25398",
ISBN = "979-8-89176-251-0",
abstract = "Jailbreaking in Large Language Models (LLMs) is a major security concern as it can deceive LLMs into generating harmful text. However, understanding of how jailbreaking works remains limited, hindering the development of effective defense strategies. To address this issue, we conduct a large-scale analysis of seven different jailbreak methods and identify that disagreements among methods stem from insufficient observation samples.We introduce the concept of a safety boundary and discover that jailbreaks shift harmful activations outside this boundary, where LLMs become less sensitive to harmful information. Our analysis reveals that low and middle layers play a critical role in these shifts, while deeper layers have a lesser impact.Building on these insights, we propose a novel defense mechanism called Activation Boundary Defense (ABD), which adaptively constrains activations within the safety boundary. To enhance its effectiveness, we use Bayesian optimization to selectively apply the defense to the low and middle layers.Experiments on several benchmark datasets demonstrate that ABD achieves an average Defense Success Rate (DSR) of over 98{\%} against various jailbreak attacks, with less than a 2{\%} impact on the model{'}s general capabilities."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gao-etal-2025-shaping">
<titleInfo>
<title>Shaping the Safety Boundaries: Understanding and Defending Against Jailbreaks in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiahui</namePart>
<namePart type="family">Geng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangliang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiuying</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Jailbreaking in Large Language Models (LLMs) is a major security concern as it can deceive LLMs into generating harmful text. However, understanding of how jailbreaking works remains limited, hindering the development of effective defense strategies. To address this issue, we conduct a large-scale analysis of seven different jailbreak methods and identify that disagreements among methods stem from insufficient observation samples. We introduce the concept of a safety boundary and discover that jailbreaks shift harmful activations outside this boundary, where LLMs become less sensitive to harmful information. Our analysis reveals that low and middle layers play a critical role in these shifts, while deeper layers have a lesser impact. Building on these insights, we propose a novel defense mechanism called Activation Boundary Defense (ABD), which adaptively constrains activations within the safety boundary. To enhance its effectiveness, we use Bayesian optimization to selectively apply the defense to the low and middle layers. Experiments on several benchmark datasets demonstrate that ABD achieves an average Defense Success Rate (DSR) of over 98% against various jailbreak attacks, with less than a 2% impact on the model’s general capabilities.</abstract>
<identifier type="citekey">gao-etal-2025-shaping</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1233</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1233/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>25378</start>
<end>25398</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Shaping the Safety Boundaries: Understanding and Defending Against Jailbreaks in Large Language Models
%A Gao, Lang
%A Geng, Jiahui
%A Zhang, Xiangliang
%A Nakov, Preslav
%A Chen, Xiuying
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F gao-etal-2025-shaping
%X Jailbreaking in Large Language Models (LLMs) is a major security concern as it can deceive LLMs into generating harmful text. However, understanding of how jailbreaking works remains limited, hindering the development of effective defense strategies. To address this issue, we conduct a large-scale analysis of seven different jailbreak methods and identify that disagreements among methods stem from insufficient observation samples. We introduce the concept of a safety boundary and discover that jailbreaks shift harmful activations outside this boundary, where LLMs become less sensitive to harmful information. Our analysis reveals that low and middle layers play a critical role in these shifts, while deeper layers have a lesser impact. Building on these insights, we propose a novel defense mechanism called Activation Boundary Defense (ABD), which adaptively constrains activations within the safety boundary. To enhance its effectiveness, we use Bayesian optimization to selectively apply the defense to the low and middle layers. Experiments on several benchmark datasets demonstrate that ABD achieves an average Defense Success Rate (DSR) of over 98% against various jailbreak attacks, with less than a 2% impact on the model’s general capabilities.
%R 10.18653/v1/2025.acl-long.1233
%U https://aclanthology.org/2025.acl-long.1233/
%U https://doi.org/10.18653/v1/2025.acl-long.1233
%P 25378-25398
Markdown (Informal)
[Shaping the Safety Boundaries: Understanding and Defending Against Jailbreaks in Large Language Models](https://aclanthology.org/2025.acl-long.1233/) (Gao et al., ACL 2025)
ACL
Lang Gao, Jiahui Geng, Xiangliang Zhang, Preslav Nakov, and Xiuying Chen. 2025. Shaping the Safety Boundaries: Understanding and Defending Against Jailbreaks in Large Language Models. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 25378–25398, Vienna, Austria. Association for Computational Linguistics.