@inproceedings{jiang-etal-2025-red,
title = "Red Queen: Exposing Latent Multi-Turn Risks in Large Language Models",
author = "Jiang, Yifan and
Aggarwal, Kriti and
Laud, Tanmay and
Munir, Kashif and
Pujara, Jay and
Mukherjee, Subhabrata",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1311/",
doi = "10.18653/v1/2025.findings-acl.1311",
pages = "25554--25591",
ISBN = "979-8-89176-256-5",
abstract = "The rapid advancement of large language models (LLMs) has unlocked diverse opportunities across domains and applications but has also raised concerns about their tendency to generate harmful responses under jailbreak attacks. However, most existing jailbreak strategies are single-turn with explicit malicious intent, failing to reflect the real-world scenario where interactions can be multi-turn and users can conceal their intents. Recent studies on Theory of Mind (ToM) reveal that LLMs often struggle to infer users' latent intent in such scenarios. Building on these limitations, we propose a novel jailbreak attack, RED QUEEN ATTACK, which constructs a multi-turn scenario, concealing the malicious intent under the guise of preventing harm. We generate 56k multi-turn concealment data points across 40 scenarios and 14 harmful categories, evaluating four LLM families of different sizes. Results show all models are vulnerable to RED QUEEN ATTACK, reaching 87.6{\%} attack success rate (ASR) on GPT-4o and 77.1{\%} on Llama3-70B. Compared to prior jailbreak attacks, the RED QUEEN ATTACK achieves superior performance on nine out of ten models, with ASR improvements ranging from 2{\%} to 64{\%}. Further analysis reveals that larger models exhibit greater vulnerability to our attack, primarily due to the combination of multi-turn structures and concealment strategies. To enhance safety, we propose RED QUEEN GUARD, a mitigation strategy reducing ASR to below 1{\%} while maintaining model performance on standard benchmarks. Full implementation and dataset are publicly accessible at https://github.com/kriti-hippo/red{\_}queen."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jiang-etal-2025-red">
<titleInfo>
<title>Red Queen: Exposing Latent Multi-Turn Risks in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yifan</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kriti</namePart>
<namePart type="family">Aggarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmay</namePart>
<namePart type="family">Laud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kashif</namePart>
<namePart type="family">Munir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jay</namePart>
<namePart type="family">Pujara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Subhabrata</namePart>
<namePart type="family">Mukherjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>The rapid advancement of large language models (LLMs) has unlocked diverse opportunities across domains and applications but has also raised concerns about their tendency to generate harmful responses under jailbreak attacks. However, most existing jailbreak strategies are single-turn with explicit malicious intent, failing to reflect the real-world scenario where interactions can be multi-turn and users can conceal their intents. Recent studies on Theory of Mind (ToM) reveal that LLMs often struggle to infer users’ latent intent in such scenarios. Building on these limitations, we propose a novel jailbreak attack, RED QUEEN ATTACK, which constructs a multi-turn scenario, concealing the malicious intent under the guise of preventing harm. We generate 56k multi-turn concealment data points across 40 scenarios and 14 harmful categories, evaluating four LLM families of different sizes. Results show all models are vulnerable to RED QUEEN ATTACK, reaching 87.6% attack success rate (ASR) on GPT-4o and 77.1% on Llama3-70B. Compared to prior jailbreak attacks, the RED QUEEN ATTACK achieves superior performance on nine out of ten models, with ASR improvements ranging from 2% to 64%. Further analysis reveals that larger models exhibit greater vulnerability to our attack, primarily due to the combination of multi-turn structures and concealment strategies. To enhance safety, we propose RED QUEEN GUARD, a mitigation strategy reducing ASR to below 1% while maintaining model performance on standard benchmarks. Full implementation and dataset are publicly accessible at https://github.com/kriti-hippo/red_queen.</abstract>
<identifier type="citekey">jiang-etal-2025-red</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1311</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1311/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>25554</start>
<end>25591</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T Red Queen: Exposing Latent Multi-Turn Risks in Large Language Models
%A Jiang, Yifan
%A Aggarwal, Kriti
%A Laud, Tanmay
%A Munir, Kashif
%A Pujara, Jay
%A Mukherjee, Subhabrata
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F jiang-etal-2025-red
%X The rapid advancement of large language models (LLMs) has unlocked diverse opportunities across domains and applications but has also raised concerns about their tendency to generate harmful responses under jailbreak attacks. However, most existing jailbreak strategies are single-turn with explicit malicious intent, failing to reflect the real-world scenario where interactions can be multi-turn and users can conceal their intents. Recent studies on Theory of Mind (ToM) reveal that LLMs often struggle to infer users’ latent intent in such scenarios. Building on these limitations, we propose a novel jailbreak attack, RED QUEEN ATTACK, which constructs a multi-turn scenario, concealing the malicious intent under the guise of preventing harm. We generate 56k multi-turn concealment data points across 40 scenarios and 14 harmful categories, evaluating four LLM families of different sizes. Results show all models are vulnerable to RED QUEEN ATTACK, reaching 87.6% attack success rate (ASR) on GPT-4o and 77.1% on Llama3-70B. Compared to prior jailbreak attacks, the RED QUEEN ATTACK achieves superior performance on nine out of ten models, with ASR improvements ranging from 2% to 64%. Further analysis reveals that larger models exhibit greater vulnerability to our attack, primarily due to the combination of multi-turn structures and concealment strategies. To enhance safety, we propose RED QUEEN GUARD, a mitigation strategy reducing ASR to below 1% while maintaining model performance on standard benchmarks. Full implementation and dataset are publicly accessible at https://github.com/kriti-hippo/red_queen.
%R 10.18653/v1/2025.findings-acl.1311
%U https://aclanthology.org/2025.findings-acl.1311/
%U https://doi.org/10.18653/v1/2025.findings-acl.1311
%P 25554-25591

Markdown (Informal)
[Red Queen: Exposing Latent Multi-Turn Risks in Large Language Models](https://aclanthology.org/2025.findings-acl.1311/) (Jiang et al., Findings 2025)

ACL
Yifan Jiang, Kriti Aggarwal, Tanmay Laud, Kashif Munir, Jay Pujara, and Subhabrata Mukherjee. 2025. Red Queen: Exposing Latent Multi-Turn Risks in Large Language Models. In Findings of the Association for Computational Linguistics: ACL 2025, pages 25554–25591, Vienna, Austria. Association for Computational Linguistics.