@inproceedings{chu-etal-2025-jailbreakradar,
title = "{J}ailbreak{R}adar: Comprehensive Assessment of Jailbreak Attacks Against {LLM}s",
author = "Chu, Junjie and
Liu, Yugeng and
Yang, Ziqing and
Shen, Xinyue and
Backes, Michael and
Zhang, Yang",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1045/",
doi = "10.18653/v1/2025.acl-long.1045",
pages = "21538--21566",
ISBN = "979-8-89176-251-0",
abstract = "Jailbreak attacks aim to bypass the LLMs' safeguards. While researchers have proposed different jailbreak attacks in depth, they have done so in isolation{---}either with unaligned settings or comparing a limited range of methods. To fill this gap, we present a large-scale evaluation of various jailbreak attacks. We collect 17 representative jailbreak attacks, summarize their features, and establish a novel jailbreak attack taxonomy. Then we conduct comprehensive measurement and ablation studies across nine aligned LLMs on 160 forbidden questions from 16 violation categories. Also, we test jailbreak attacks under eight advanced defenses. Based on our taxonomy and experiments, we identify some important patterns, such as heuristic-based attacks, which could achieve high attack success rates but are easy to mitigate by defenses. Our study offers valuable insights for future research on jailbreak attacks and defenses and serves as a benchmark tool for researchers and practitioners to evaluate them effectively."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chu-etal-2025-jailbreakradar">
<titleInfo>
<title>JailbreakRadar: Comprehensive Assessment of Jailbreak Attacks Against LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junjie</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yugeng</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziqing</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyue</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Backes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Jailbreak attacks aim to bypass the LLMs’ safeguards. While researchers have proposed different jailbreak attacks in depth, they have done so in isolation—either with unaligned settings or comparing a limited range of methods. To fill this gap, we present a large-scale evaluation of various jailbreak attacks. We collect 17 representative jailbreak attacks, summarize their features, and establish a novel jailbreak attack taxonomy. Then we conduct comprehensive measurement and ablation studies across nine aligned LLMs on 160 forbidden questions from 16 violation categories. Also, we test jailbreak attacks under eight advanced defenses. Based on our taxonomy and experiments, we identify some important patterns, such as heuristic-based attacks, which could achieve high attack success rates but are easy to mitigate by defenses. Our study offers valuable insights for future research on jailbreak attacks and defenses and serves as a benchmark tool for researchers and practitioners to evaluate them effectively.</abstract>
<identifier type="citekey">chu-etal-2025-jailbreakradar</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1045</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1045/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>21538</start>
<end>21566</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T JailbreakRadar: Comprehensive Assessment of Jailbreak Attacks Against LLMs
%A Chu, Junjie
%A Liu, Yugeng
%A Yang, Ziqing
%A Shen, Xinyue
%A Backes, Michael
%A Zhang, Yang
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F chu-etal-2025-jailbreakradar
%X Jailbreak attacks aim to bypass the LLMs’ safeguards. While researchers have proposed different jailbreak attacks in depth, they have done so in isolation—either with unaligned settings or comparing a limited range of methods. To fill this gap, we present a large-scale evaluation of various jailbreak attacks. We collect 17 representative jailbreak attacks, summarize their features, and establish a novel jailbreak attack taxonomy. Then we conduct comprehensive measurement and ablation studies across nine aligned LLMs on 160 forbidden questions from 16 violation categories. Also, we test jailbreak attacks under eight advanced defenses. Based on our taxonomy and experiments, we identify some important patterns, such as heuristic-based attacks, which could achieve high attack success rates but are easy to mitigate by defenses. Our study offers valuable insights for future research on jailbreak attacks and defenses and serves as a benchmark tool for researchers and practitioners to evaluate them effectively.
%R 10.18653/v1/2025.acl-long.1045
%U https://aclanthology.org/2025.acl-long.1045/
%U https://doi.org/10.18653/v1/2025.acl-long.1045
%P 21538-21566
Markdown (Informal)
[JailbreakRadar: Comprehensive Assessment of Jailbreak Attacks Against LLMs](https://aclanthology.org/2025.acl-long.1045/) (Chu et al., ACL 2025)
ACL