@inproceedings{ha-etal-2025-one,
title = "{M2S}: Multi-turn to Single-turn jailbreak in Red Teaming for {LLM}s",
author = "Ha, Junwoo and
Kim, Hyunjun and
Yu, Sangyoon and
Park, Haon and
Yousefpour, Ashkan and
Park, Yuna and
Kim, Suhyun",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.805/",
doi = "10.18653/v1/2025.acl-long.805",
pages = "16489--16507",
ISBN = "979-8-89176-251-0",
abstract = "We introduce a novel framework for consolidating multi-turn adversarial ``jailbreak'' prompts into single-turn queries, significantly reducing the manual overhead required for adversarial testing of large language models (LLMs). While multi-turn human jailbreaks have been shown to yield high attack success rates (ASRs), they demand considerable human effort and time. Our proposed Multi-turn-to-Single-turn (M2S) methods{---}Hyphenize, Numberize, and Pythonize{---}systematically reformat multi-turn dialogues into structured single-turn prompts. Despite eliminating iterative back-and-forth interactions, these reformatted prompts preserve and often enhance adversarial potency: in extensive evaluations on the Multi-turn Human Jailbreak (MHJ) dataset, M2S methods yield ASRs ranging from 70.6 {\%} to 95.9 {\%} across various state-of-the-art LLMs. Remarkably, our single-turn prompts outperform the original multi-turn attacks by up to 17.5 {\%} in absolute ASR, while reducing token usage by more than half on average. Further analyses reveal that embedding malicious requests in enumerated or code-like structures exploits ``contextual blindness,'' undermining both native guardrails and external input-output safeguards. By consolidating multi-turn conversations into efficient single-turn prompts, our M2S framework provides a powerful tool for large-scale red-teaming and exposes critical vulnerabilities in contemporary LLM defenses. All code, data, and conversion prompts are available for reproducibility and further investigations: https://github.com/Junuha/M2S{\_}DATA"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ha-etal-2025-one">
<titleInfo>
<title>M2S: Multi-turn to Single-turn jailbreak in Red Teaming for LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junwoo</namePart>
<namePart type="family">Ha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyunjun</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sangyoon</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haon</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashkan</namePart>
<namePart type="family">Yousefpour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuna</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suhyun</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>We introduce a novel framework for consolidating multi-turn adversarial “jailbreak” prompts into single-turn queries, significantly reducing the manual overhead required for adversarial testing of large language models (LLMs). While multi-turn human jailbreaks have been shown to yield high attack success rates (ASRs), they demand considerable human effort and time. Our proposed Multi-turn-to-Single-turn (M2S) methods—Hyphenize, Numberize, and Pythonize—systematically reformat multi-turn dialogues into structured single-turn prompts. Despite eliminating iterative back-and-forth interactions, these reformatted prompts preserve and often enhance adversarial potency: in extensive evaluations on the Multi-turn Human Jailbreak (MHJ) dataset, M2S methods yield ASRs ranging from 70.6 % to 95.9 % across various state-of-the-art LLMs. Remarkably, our single-turn prompts outperform the original multi-turn attacks by up to 17.5 % in absolute ASR, while reducing token usage by more than half on average. Further analyses reveal that embedding malicious requests in enumerated or code-like structures exploits “contextual blindness,” undermining both native guardrails and external input-output safeguards. By consolidating multi-turn conversations into efficient single-turn prompts, our M2S framework provides a powerful tool for large-scale red-teaming and exposes critical vulnerabilities in contemporary LLM defenses. All code, data, and conversion prompts are available for reproducibility and further investigations: https://github.com/Junuha/M2S_DATA</abstract>
<identifier type="citekey">ha-etal-2025-one</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.805</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.805/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>16489</start>
<end>16507</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T M2S: Multi-turn to Single-turn jailbreak in Red Teaming for LLMs
%A Ha, Junwoo
%A Kim, Hyunjun
%A Yu, Sangyoon
%A Park, Haon
%A Yousefpour, Ashkan
%A Park, Yuna
%A Kim, Suhyun
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F ha-etal-2025-one
%X We introduce a novel framework for consolidating multi-turn adversarial “jailbreak” prompts into single-turn queries, significantly reducing the manual overhead required for adversarial testing of large language models (LLMs). While multi-turn human jailbreaks have been shown to yield high attack success rates (ASRs), they demand considerable human effort and time. Our proposed Multi-turn-to-Single-turn (M2S) methods—Hyphenize, Numberize, and Pythonize—systematically reformat multi-turn dialogues into structured single-turn prompts. Despite eliminating iterative back-and-forth interactions, these reformatted prompts preserve and often enhance adversarial potency: in extensive evaluations on the Multi-turn Human Jailbreak (MHJ) dataset, M2S methods yield ASRs ranging from 70.6 % to 95.9 % across various state-of-the-art LLMs. Remarkably, our single-turn prompts outperform the original multi-turn attacks by up to 17.5 % in absolute ASR, while reducing token usage by more than half on average. Further analyses reveal that embedding malicious requests in enumerated or code-like structures exploits “contextual blindness,” undermining both native guardrails and external input-output safeguards. By consolidating multi-turn conversations into efficient single-turn prompts, our M2S framework provides a powerful tool for large-scale red-teaming and exposes critical vulnerabilities in contemporary LLM defenses. All code, data, and conversion prompts are available for reproducibility and further investigations: https://github.com/Junuha/M2S_DATA
%R 10.18653/v1/2025.acl-long.805
%U https://aclanthology.org/2025.acl-long.805/
%U https://doi.org/10.18653/v1/2025.acl-long.805
%P 16489-16507
Markdown (Informal)
[M2S: Multi-turn to Single-turn jailbreak in Red Teaming for LLMs](https://aclanthology.org/2025.acl-long.805/) (Ha et al., ACL 2025)
ACL
- Junwoo Ha, Hyunjun Kim, Sangyoon Yu, Haon Park, Ashkan Yousefpour, Yuna Park, and Suhyun Kim. 2025. M2S: Multi-turn to Single-turn jailbreak in Red Teaming for LLMs. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 16489–16507, Vienna, Austria. Association for Computational Linguistics.