% BibTeX record for the ACL Anthology paper at https://aclanthology.org/2026.findings-acl.1188/
@inproceedings{han-etal-2026-experience,
  title     = {Experience-Driven Multi-Agent Optimization for Black-Box Jailbreak Attacks on Large Language Models},
  author    = {Han, Zhaoyang and
               Liu, Yihe and
               Zhang, Kai and
               Li, Ping},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1188/},
  pages     = {23729--23747},
  isbn      = {979-8-89176-395-1},
  abstract  = {The rapid discovery of jailbreak prompts has revealed the alarming fragility of safety alignment in frontier large language models (LLMs). While jailbreak techniques play a critical role in red-teaming and safety evaluation, existing methods exhibit three key limitations: (i) poor transferability across model families, requiring model-specific manual tuning; (ii) heavy reliance on large-scale prompt enumeration or exhaustive search, causing prohibitive query costs and poor scalability; and (iii) high sensitivity to input preprocessing or refusal-oriented fine-tuning, leading to attack failures once the underlying model is updated. To address these, we propose Experience-driven Multi-agent Jailbreak Optimization (EMJO), which couples three collaborating agents (Attacker, Analyzer, and Judge) into a closed-loop ``probe{--}evaluate{--}revise'' process, together with a dynamic experience bank accumulating high-quality successful prompts and reusable strategy patterns across iterations and tasks. This design enables query-efficient and transferable jailbreak optimization under black-box access. Extensive experiments on diverse LLMs demonstrate that EMJO consistently outperforms existing black-box jailbreak baselines, achieving up to 11{\%} absolute improvement in attack success rate while reducing the average query cost by up to 7.9$\times$ across two benchmark datasets. These results indicate that EMJO offers an effective and scalable paradigm for systematic jailbreak discovery.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="han-etal-2026-experience">
<titleInfo>
<title>Experience-Driven Multi-Agent Optimization for Black-Box Jailbreak Attacks on Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhaoyang</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yihe</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ping</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>The rapid discovery of jailbreak prompts has revealed the alarming fragility of safety alignment in frontier large language models (LLMs). While jailbreak techniques play a critical role in red-teaming and safety evaluation, existing methods exhibit three key limitations: (i) poor transferability across model families, requiring model-specific manual tuning; (ii) heavy reliance on large-scale prompt enumeration or exhaustive search, causing prohibitive query costs and poor scalability; and (iii) high sensitivity to input preprocessing or refusal-oriented fine-tuning, leading to attack failures once the underlying model is updated. To address these, we propose Experience-driven Multi-agent Jailbreak Optimization (EMJO), which couples three collaborating agents (Attacker, Analyzer, and Judge) into a closed-loop “probe–evaluate–revise” process, together with a dynamic experience bank accumulating high-quality successful prompts and reusable strategy patterns across iterations and tasks. This design enables query-efficient and transferable jailbreak optimization under black-box access. Extensive experiments on diverse LLMs demonstrate that EMJO consistently outperforms existing black-box jailbreak baselines, achieving up to 11% absolute improvement in attack success rate while reducing the average query cost by up to 7.9× across two benchmark datasets. These results indicate that EMJO offers an effective and scalable paradigm for systematic jailbreak discovery.</abstract>
<identifier type="citekey">han-etal-2026-experience</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1188/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>23729</start>
<end>23747</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Experience-Driven Multi-Agent Optimization for Black-Box Jailbreak Attacks on Large Language Models
%A Han, Zhaoyang
%A Liu, Yihe
%A Zhang, Kai
%A Li, Ping
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F han-etal-2026-experience
%X The rapid discovery of jailbreak prompts has revealed the alarming fragility of safety alignment in frontier large language models (LLMs). While jailbreak techniques play a critical role in red-teaming and safety evaluation, existing methods exhibit three key limitations: (i) poor transferability across model families, requiring model-specific manual tuning; (ii) heavy reliance on large-scale prompt enumeration or exhaustive search, causing prohibitive query costs and poor scalability; and (iii) high sensitivity to input preprocessing or refusal-oriented fine-tuning, leading to attack failures once the underlying model is updated. To address these, we propose Experience-driven Multi-agent Jailbreak Optimization (EMJO), which couples three collaborating agents (Attacker, Analyzer, and Judge) into a closed-loop “probe–evaluate–revise” process, together with a dynamic experience bank accumulating high-quality successful prompts and reusable strategy patterns across iterations and tasks. This design enables query-efficient and transferable jailbreak optimization under black-box access. Extensive experiments on diverse LLMs demonstrate that EMJO consistently outperforms existing black-box jailbreak baselines, achieving up to 11% absolute improvement in attack success rate while reducing the average query cost by up to 7.9× across two benchmark datasets. These results indicate that EMJO offers an effective and scalable paradigm for systematic jailbreak discovery.
%U https://aclanthology.org/2026.findings-acl.1188/
%P 23729-23747
Markdown (Informal)
[Experience-Driven Multi-Agent Optimization for Black-Box Jailbreak Attacks on Large Language Models](https://aclanthology.org/2026.findings-acl.1188/) (Han et al., Findings 2026)
ACL