@inproceedings{hu-etal-2026-jailbreak,
title = "Jailbreak-Zero: A Path to {P}areto Optimal Red Teaming for Large Language Models",
author = "Hu, Kai and
Aggarwal, Abhinav and
Khodabandeh, Mehran and
Zhang, David and
Hsin, Eric and
Chen, Li and
Jain, Ankit and
Fredrikson, Matt and
Bharadwaj, Akash",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.2167/",
pages = "46720--46746",
ISBN = "979-8-89176-390-6",
abstract = "This paper presents a novel Automated Red Teaming (ART) framework that shifts from example-based to policy-based evaluation, addressing critical limitations in scalability and validity. We define harmful content through abstract safety policies rather than specific static examples. We also introduce multiple evaluation objectives: risk coverage, semantic diversity, and fidelity, and discover Pareto trade-offs between them. We propose Jailbreak-Zero, a black-box method capable of both zero-shot generation and fine-tuned exploitation of a victim{'}s vulnerabilities to achieve Pareto optimality. Unlike prior approaches, it does not require expert-designed strategies/prompts, but still achieves superior, human-readable attacks against open-source and proprietary models (attack success rates of 99.5{\%} against GPT-4o and 96.0{\%} against Claude 3.5), even for unseen safety policies. It retains efficacy even after victim models undergo safety alignment, and exposes controls to navigate Pareto trade-offs \textit{without} retraining. Lastly, we show that Jailbreak-Zero is the best-performing ART method at a given compute budget. Code is available at: https://github.com/hukkai/jailbreak-zero/ ."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hu-etal-2026-jailbreak">
<titleInfo>
<title>Jailbreak-Zero: A Path to Pareto Optimal Red Teaming for Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhinav</namePart>
<namePart type="family">Aggarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehran</namePart>
<namePart type="family">Khodabandeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Hsin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ankit</namePart>
<namePart type="family">Jain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matt</namePart>
<namePart type="family">Fredrikson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akash</namePart>
<namePart type="family">Bharadwaj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>This paper presents a novel Automated Red Teaming (ART) framework that shifts from example-based to policy-based evaluation, addressing critical limitations in scalability and validity. We define harmful content through abstract safety policies rather than specific static examples. We also introduce multiple evaluation objectives: risk coverage, semantic diversity, and fidelity, and discover Pareto trade-offs between them. We propose Jailbreak-Zero, a black-box method capable of both zero-shot generation and fine-tuned exploitation of a victim’s vulnerabilities to achieve Pareto optimality. Unlike prior approaches, it does not require expert-designed strategies/prompts, but still achieves superior, human-readable attacks against open-source and proprietary models (attack success rates of 99.5% against GPT-4o and 96.0% against Claude 3.5), even for unseen safety policies. It retains efficacy even after victim models undergo safety alignment, and exposes controls to navigate Pareto trade-offs without retraining. Lastly, we show that Jailbreak-Zero is the best-performing ART method at a given compute budget. Code is available at: https://github.com/hukkai/jailbreak-zero/ .</abstract>
<identifier type="citekey">hu-etal-2026-jailbreak</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.2167/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>46720</start>
<end>46746</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Jailbreak-Zero: A Path to Pareto Optimal Red Teaming for Large Language Models
%A Hu, Kai
%A Aggarwal, Abhinav
%A Khodabandeh, Mehran
%A Zhang, David
%A Hsin, Eric
%A Chen, Li
%A Jain, Ankit
%A Fredrikson, Matt
%A Bharadwaj, Akash
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F hu-etal-2026-jailbreak
%X This paper presents a novel Automated Red Teaming (ART) framework that shifts from example-based to policy-based evaluation, addressing critical limitations in scalability and validity. We define harmful content through abstract safety policies rather than specific static examples. We also introduce multiple evaluation objectives: risk coverage, semantic diversity, and fidelity, and discover Pareto trade-offs between them. We propose Jailbreak-Zero, a black-box method capable of both zero-shot generation and fine-tuned exploitation of a victim’s vulnerabilities to achieve Pareto optimality. Unlike prior approaches, it does not require expert-designed strategies/prompts, but still achieves superior, human-readable attacks against open-source and proprietary models (attack success rates of 99.5% against GPT-4o and 96.0% against Claude 3.5), even for unseen safety policies. It retains efficacy even after victim models undergo safety alignment, and exposes controls to navigate Pareto trade-offs without retraining. Lastly, we show that Jailbreak-Zero is the best-performing ART method at a given compute budget. Code is available at: https://github.com/hukkai/jailbreak-zero/ .
%U https://aclanthology.org/2026.acl-long.2167/
%P 46720-46746
Markdown (Informal)
[Jailbreak-Zero: A Path to Pareto Optimal Red Teaming for Large Language Models](https://aclanthology.org/2026.acl-long.2167/) (Hu et al., ACL 2026)
ACL
- Kai Hu, Abhinav Aggarwal, Mehran Khodabandeh, David Zhang, Eric Hsin, Li Chen, Ankit Jain, Matt Fredrikson, and Akash Bharadwaj. 2026. Jailbreak-Zero: A Path to Pareto Optimal Red Teaming for Large Language Models. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 46720–46746, San Diego, California, United States. Association for Computational Linguistics.