% Conference paper record (Findings of ACL 2026); see the url field for the landing page.
% Case-sensitive words in title/booktitle are brace-protected as whole words so that
% sentence-casing styles keep them intact. The address field is kept as given in the
% source record.
@inproceedings{tong-etal-2026-rlshield,
  title     = {{RLShield}: Dynamic Jailbreak Detection for {LLMs} via Reinforced Adaptive Learning},
  author    = {Tong, Zhao and
               Yang, Pengfei and
               Gu, Yimeng and
               Shi, Haichao and
               Liu, Qiang and
               Xu, Xingcheng and
               Wu, Shu and
               Zhang, Xiao-Yu},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1182/},
  pages     = {23615--23630},
  isbn      = {979-8-89176-395-1},
  abstract  = {While prompt engineering enhances the capabilities of Large Language Models (LLMs), it also exposes critical safety concerns. Due to the inherent brittleness of their static safety boundaries, LLMs are vulnerable to jailbreak prompts, i.e. adversarial inputs designed to bypass safeguards and induce the generation of harmful content. Existing detection mechanisms rely on static model components or fixed decision thresholds, limiting their ability to generalize to evolving attack patterns and continual model updates. To bridge this gap, we propose RLShield, a dynamic jailbreak detection framework that employs reinforcement learning for adaptive threshold selection. RLShield incorporates three key innovations: (i) a dynamic retrieval and LLM-based rewriting module to simulate diverse adversarial contexts; (ii) a cross-layer representation analysis to pinpoint safety-critical parameters; and (iii) a Soft Actor-Critic (SAC) based agent that learns to predict optimal, sample-specific detection thresholds. Experimental results demonstrate that RLShield consistently outperforms state-of-the-art baselines in detection performance while maintaining high computational efficiency. Notably, it improves F1 by up to 7.3{\%}, while achieving an average of 3$\times$ gain in inference efficiency across multiple LLM backbones.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for one conference paper. Top-level name elements are the authors;
       the relatedItem of type "host" describes the proceedings volume and its editors. -->
  <mods ID="tong-etal-2026-rlshield">
    <titleInfo>
      <title>RLShield: Dynamic Jailbreak Detection for LLMs via Reinforced Adaptive Learning</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Zhao</namePart>
      <namePart type="family">Tong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pengfei</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yimeng</namePart>
      <namePart type="family">Gu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Haichao</namePart>
      <namePart type="family">Shi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qiang</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xingcheng</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shu</namePart>
      <namePart type="family">Wu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiao-Yu</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume, its editors, publisher and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <!-- Plain-text abstract: no LaTeX markup here (the multiplication sign is a literal character) -->
    <abstract>While prompt engineering enhances the capabilities of Large Language Models (LLMs), it also exposes critical safety concerns. Due to the inherent brittleness of their static safety boundaries, LLMs are vulnerable to jailbreak prompts, i.e. adversarial inputs designed to bypass safeguards and induce the generation of harmful content. Existing detection mechanisms rely on static model components or fixed decision thresholds, limiting their ability to generalize to evolving attack patterns and continual model updates. To bridge this gap, we propose RLShield, a dynamic jailbreak detection framework that employs reinforcement learning for adaptive threshold selection. RLShield incorporates three key innovations: (i) a dynamic retrieval and LLM-based rewriting module to simulate diverse adversarial contexts; (ii) a cross-layer representation analysis to pinpoint safety-critical parameters; and (iii) a Soft Actor-Critic (SAC) based agent that learns to predict optimal, sample-specific detection thresholds. Experimental results demonstrate that RLShield consistently outperforms state-of-the-art baselines in detection performance while maintaining high computational efficiency. Notably, it improves F1 by up to 7.3%, while achieving an average of 3× gain in inference efficiency across multiple LLM backbones.</abstract>
    <identifier type="citekey">tong-etal-2026-rlshield</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1182/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>23615</start>
        <end>23630</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T RLShield: Dynamic Jailbreak Detection for LLMs via Reinforced Adaptive Learning
%A Tong, Zhao
%A Yang, Pengfei
%A Gu, Yimeng
%A Shi, Haichao
%A Liu, Qiang
%A Xu, Xingcheng
%A Wu, Shu
%A Zhang, Xiao-Yu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F tong-etal-2026-rlshield
%X While prompt engineering enhances the capabilities of Large Language Models (LLMs), it also exposes critical safety concerns. Due to the inherent brittleness of their static safety boundaries, LLMs are vulnerable to jailbreak prompts, i.e. adversarial inputs designed to bypass safeguards and induce the generation of harmful content. Existing detection mechanisms rely on static model components or fixed decision thresholds, limiting their ability to generalize to evolving attack patterns and continual model updates. To bridge this gap, we propose RLShield, a dynamic jailbreak detection framework that employs reinforcement learning for adaptive threshold selection. RLShield incorporates three key innovations: (i) a dynamic retrieval and LLM-based rewriting module to simulate diverse adversarial contexts; (ii) a cross-layer representation analysis to pinpoint safety-critical parameters; and (iii) a Soft Actor-Critic (SAC) based agent that learns to predict optimal, sample-specific detection thresholds. Experimental results demonstrate that RLShield consistently outperforms state-of-the-art baselines in detection performance while maintaining high computational efficiency. Notably, it improves F1 by up to 7.3%, while achieving an average of 3× gain in inference efficiency across multiple LLM backbones.
%U https://aclanthology.org/2026.findings-acl.1182/
%P 23615-23630
Markdown (Informal)
[RLShield: Dynamic Jailbreak Detection for LLMs via Reinforced Adaptive Learning](https://aclanthology.org/2026.findings-acl.1182/) (Tong et al., Findings 2026)
ACL
- Zhao Tong, Pengfei Yang, Yimeng Gu, Haichao Shi, Qiang Liu, Xingcheng Xu, Shu Wu, and Xiao-Yu Zhang. 2026. RLShield: Dynamic Jailbreak Detection for LLMs via Reinforced Adaptive Learning. In Findings of the Association for Computational Linguistics: ACL 2026, pages 23615–23630, San Diego, California, United States. Association for Computational Linguistics.