@inproceedings{zhang-etal-2025-wordgame,
title = "{W}ord{G}ame: Efficient {\&} Effective {LLM} Jailbreak via Simultaneous Obfuscation in Query and Response",
author = "Zhang, Tianrong and
Cao, Bochuan and
Cao, Yuanpu and
Lin, Lu and
Mitra, Prasenjit and
Chen, Jinghui",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.269/",
doi = "10.18653/v1/2025.findings-naacl.269",
pages = "4779--4807",
ISBN = "979-8-89176-195-7",
abstract = "The recent breakthrough in large language models (LLMs) such as ChatGPT has revolutionized every industry at an unprecedented pace. Alongside this progress also comes mounting concerns about LLMs' susceptibility to jailbreaking attacks, which leads to the generation of harmful or unsafe content. While safety alignment measures have been implemented in LLMs to mitigate existing jailbreak attempts and force them to become increasingly complicated, it is still far from perfect. In this paper, we analyze the common pattern of the current safety alignment and show that it is possible to exploit such patterns for jailbreaking attacks by simultaneous obfuscation in queries and responses. Specifically, we propose WordGame attack, which replaces malicious words with word games to break down the adversarial intent of a query and encourage benign content regarding the games to precede the anticipated harmful content in the response, creating a context that is hardly covered by any corpus used for safety alignment. Extensive experiments demonstrate that WordGame attack can break the guardrails of the current leading proprietary and open-source LLMs, including the latest Claude 3, GPT 4, and Llama 3 models more effectively than existing attacks efficiently. The attack also remains powerful when external defenses are adopted. Further ablation studies on such simultaneous obfuscation in query and response provide evidence of the merits of the attack strategy beyond an individual attack."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-wordgame">
<titleInfo>
<title>WordGame: Efficient &amp; Effective LLM Jailbreak via Simultaneous Obfuscation in Query and Response</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianrong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bochuan</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuanpu</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prasenjit</namePart>
<namePart type="family">Mitra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinghui</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>The recent breakthrough in large language models (LLMs) such as ChatGPT has revolutionized every industry at an unprecedented pace. Alongside this progress come mounting concerns about LLMs’ susceptibility to jailbreaking attacks, which lead to the generation of harmful or unsafe content. While safety alignment measures have been implemented in LLMs to mitigate existing jailbreak attempts, forcing them to become increasingly complicated, such alignment is still far from perfect. In this paper, we analyze the common patterns of current safety alignment and show that it is possible to exploit such patterns for jailbreaking attacks through simultaneous obfuscation in queries and responses. Specifically, we propose the WordGame attack, which replaces malicious words with word games to break down the adversarial intent of a query and encourages benign content about the games to precede the anticipated harmful content in the response, creating a context that is hardly covered by any corpus used for safety alignment. Extensive experiments demonstrate that the WordGame attack breaks the guardrails of current leading proprietary and open-source LLMs, including the latest Claude 3, GPT 4, and Llama 3 models, more effectively and efficiently than existing attacks. The attack also remains powerful when external defenses are adopted. Further ablation studies on such simultaneous obfuscation in query and response provide evidence of the merits of the attack strategy beyond an individual attack.</abstract>
<identifier type="citekey">zhang-etal-2025-wordgame</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.269</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.269/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>4779</start>
<end>4807</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T WordGame: Efficient & Effective LLM Jailbreak via Simultaneous Obfuscation in Query and Response
%A Zhang, Tianrong
%A Cao, Bochuan
%A Cao, Yuanpu
%A Lin, Lu
%A Mitra, Prasenjit
%A Chen, Jinghui
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F zhang-etal-2025-wordgame
%X The recent breakthrough in large language models (LLMs) such as ChatGPT has revolutionized every industry at an unprecedented pace. Alongside this progress come mounting concerns about LLMs’ susceptibility to jailbreaking attacks, which lead to the generation of harmful or unsafe content. While safety alignment measures have been implemented in LLMs to mitigate existing jailbreak attempts, forcing them to become increasingly complicated, such alignment is still far from perfect. In this paper, we analyze the common patterns of current safety alignment and show that it is possible to exploit such patterns for jailbreaking attacks through simultaneous obfuscation in queries and responses. Specifically, we propose the WordGame attack, which replaces malicious words with word games to break down the adversarial intent of a query and encourages benign content about the games to precede the anticipated harmful content in the response, creating a context that is hardly covered by any corpus used for safety alignment. Extensive experiments demonstrate that the WordGame attack breaks the guardrails of current leading proprietary and open-source LLMs, including the latest Claude 3, GPT 4, and Llama 3 models, more effectively and efficiently than existing attacks. The attack also remains powerful when external defenses are adopted. Further ablation studies on such simultaneous obfuscation in query and response provide evidence of the merits of the attack strategy beyond an individual attack.
%R 10.18653/v1/2025.findings-naacl.269
%U https://aclanthology.org/2025.findings-naacl.269/
%U https://doi.org/10.18653/v1/2025.findings-naacl.269
%P 4779-4807
Markdown (Informal)
[WordGame: Efficient & Effective LLM Jailbreak via Simultaneous Obfuscation in Query and Response](https://aclanthology.org/2025.findings-naacl.269/) (Zhang et al., Findings 2025)
ACL
Tianrong Zhang, Bochuan Cao, Yuanpu Cao, Lu Lin, Prasenjit Mitra, and Jinghui Chen. 2025. [WordGame: Efficient & Effective LLM Jailbreak via Simultaneous Obfuscation in Query and Response](https://aclanthology.org/2025.findings-naacl.269/). In *Findings of the Association for Computational Linguistics: NAACL 2025*, pages 4779–4807, Albuquerque, New Mexico. Association for Computational Linguistics.