@inproceedings{yan-etal-2025-semanticcamo,
title = "{S}emantic{C}amo: Jailbreaking Large Language Models through Semantic Camouflage",
author = "Yan, Jihui and
Yang, Xiaocui and
Wang, Daling and
Feng, Shi and
Zhang, Yifei and
Zhao, Yinzhi",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.745/",
doi = "10.18653/v1/2025.findings-acl.745",
pages = "14427--14452",
ISBN = "979-8-89176-256-5",
abstract = "The rapid development and increasingly widespread applications of Large Language Models (LLMs) have made the safety issues of LLMs more prominent and critical. Although safety training is widely used in LLMs, the mismatch between pre-training and safety training still leads to safety vulnerabilities. To expose the safety vulnerabilities in LLMs and improve LLMs' performance in safety, we propose a novel framework, SemanticCamo, which attacks LLMs through semantic camouflage.SemanticCamo bypasses safety guardrails by replacing the original unsafe content with semantic features, thereby concealing malicious intent while keeping the query{'}s objectives unchanged. We conduct comprehensive experiments on the state-of-the-art LLMs, including GPT-4o and Claude-3.5, finding that SemanticCamo successfully induces harmful responses from the target models in over 80{\%} of cases on average, outperforming previous counterparts. Additionally, the performance of SemanticCamo against various defenses is evaluated, demonstrating that semantic transformations introduce critical challenges to LLM safety, necessitating targeted alignment strategies to address this vulnerability. Code and data are available at https://github.com/Jihui-Yan/SemanticCamo."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yan-etal-2025-semanticcamo">
<titleInfo>
<title>SemanticCamo: Jailbreaking Large Language Models through Semantic Camouflage</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jihui</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaocui</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daling</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shi</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifei</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinzhi</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>The rapid development and increasingly widespread applications of Large Language Models (LLMs) have made the safety issues of LLMs more prominent and critical. Although safety training is widely used in LLMs, the mismatch between pre-training and safety training still leads to safety vulnerabilities. To expose the safety vulnerabilities in LLMs and improve LLMs’ performance in safety, we propose a novel framework, SemanticCamo, which attacks LLMs through semantic camouflage.SemanticCamo bypasses safety guardrails by replacing the original unsafe content with semantic features, thereby concealing malicious intent while keeping the query’s objectives unchanged. We conduct comprehensive experiments on the state-of-the-art LLMs, including GPT-4o and Claude-3.5, finding that SemanticCamo successfully induces harmful responses from the target models in over 80% of cases on average, outperforming previous counterparts. Additionally, the performance of SemanticCamo against various defenses is evaluated, demonstrating that semantic transformations introduce critical challenges to LLM safety, necessitating targeted alignment strategies to address this vulnerability. Code and data are available at https://github.com/Jihui-Yan/SemanticCamo.</abstract>
<identifier type="citekey">yan-etal-2025-semanticcamo</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.745</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.745/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>14427</start>
<end>14452</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SemanticCamo: Jailbreaking Large Language Models through Semantic Camouflage
%A Yan, Jihui
%A Yang, Xiaocui
%A Wang, Daling
%A Feng, Shi
%A Zhang, Yifei
%A Zhao, Yinzhi
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F yan-etal-2025-semanticcamo
%X The rapid development and increasingly widespread applications of Large Language Models (LLMs) have made the safety issues of LLMs more prominent and critical. Although safety training is widely used in LLMs, the mismatch between pre-training and safety training still leads to safety vulnerabilities. To expose the safety vulnerabilities in LLMs and improve LLMs’ performance in safety, we propose a novel framework, SemanticCamo, which attacks LLMs through semantic camouflage.SemanticCamo bypasses safety guardrails by replacing the original unsafe content with semantic features, thereby concealing malicious intent while keeping the query’s objectives unchanged. We conduct comprehensive experiments on the state-of-the-art LLMs, including GPT-4o and Claude-3.5, finding that SemanticCamo successfully induces harmful responses from the target models in over 80% of cases on average, outperforming previous counterparts. Additionally, the performance of SemanticCamo against various defenses is evaluated, demonstrating that semantic transformations introduce critical challenges to LLM safety, necessitating targeted alignment strategies to address this vulnerability. Code and data are available at https://github.com/Jihui-Yan/SemanticCamo.
%R 10.18653/v1/2025.findings-acl.745
%U https://aclanthology.org/2025.findings-acl.745/
%U https://doi.org/10.18653/v1/2025.findings-acl.745
%P 14427-14452
Markdown (Informal)
[SemanticCamo: Jailbreaking Large Language Models through Semantic Camouflage](https://aclanthology.org/2025.findings-acl.745/) (Yan et al., Findings 2025)
ACL