@inproceedings{ji-etal-2025-defending,
title = "Defending Large Language Models against Jailbreak Attacks via Semantic Smoothing",
author = "Ji, Jiabao and
Hou, Bairu and
Robey, Alexander and
Pappas, George J. and
Hassani, Hamed and
Zhang, Yang and
Wong, Eric and
Chang, Shiyu",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://aclanthology.org/2025.ijcnlp-short.2/",
pages = "7--40",
ISBN = "979-8-89176-299-2",
abstract = "Aligned large language models (LLMs) are vulnerable to jailbreaks, which bypass the safeguards of targeted LLMs and fool them into generating objectionable content. While initial defenses show promise against token-based attacks, there are no defenses that provide robustness against semantic attacks and avoid unfavorable trade-offs between robustness and nominal performance. To meet this need, we propose SemanticSmooth, a smoothing-based defense that aggregates the predictions of multiple semantically transformed copies of a given input prompt. Experimental results demonstrate that SemanticSmooth achieves strong robustness against both manually constructed jailbreak prompts and automatic jailbreak attacks like GCG, PAIR, and PromptRS while maintaining strong nominal performance on standard LLM evaluation benchmarks such as AlpacaEval for the instruction-following tasks and PiQA for the question-answering tasks."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ji-etal-2025-defending">
<titleInfo>
<title>Defending Large Language Models against Jailbreak Attacks via Semantic Smoothing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiabao</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bairu</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Robey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">George</namePart>
<namePart type="given">J</namePart>
<namePart type="family">Pappas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamed</namePart>
<namePart type="family">Hassani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shiyu</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haofen</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derek</namePart>
<namePart type="given">F</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Biplab</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhirendra</namePart>
<namePart type="given">Pratap</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The Asian Federation of Natural Language Processing and The Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-299-2</identifier>
</relatedItem>
<abstract>Aligned large language models (LLMs) are vulnerable to jailbreaks, which bypass the safeguards of targeted LLMs and fool them into generating objectionable content. While initial defenses show promise against token-based attacks, there are no defenses that provide robustness against semantic attacks and avoid unfavorable trade-offs between robustness and nominal performance. To meet this need, we propose SemanticSmooth, a smoothing-based defense that aggregates the predictions of multiple semantically transformed copies of a given input prompt. Experimental results demonstrate that SemanticSmooth achieves strong robustness against both manually constructed jailbreak prompts and automatic jailbreak attacks like GCG, PAIR, and PromptRS while maintaining strong nominal performance on standard LLM evaluation benchmarks such as AlpacaEval for the instruction-following tasks and PiQA for the question-answering tasks.</abstract>
<identifier type="citekey">ji-etal-2025-defending</identifier>
<location>
<url>https://aclanthology.org/2025.ijcnlp-short.2/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>7</start>
<end>40</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Defending Large Language Models against Jailbreak Attacks via Semantic Smoothing
%A Ji, Jiabao
%A Hou, Bairu
%A Robey, Alexander
%A Pappas, George J.
%A Hassani, Hamed
%A Zhang, Yang
%A Wong, Eric
%A Chang, Shiyu
%Y Inui, Kentaro
%Y Sakti, Sakriani
%Y Wang, Haofen
%Y Wong, Derek F.
%Y Bhattacharyya, Pushpak
%Y Banerjee, Biplab
%Y Ekbal, Asif
%Y Chakraborty, Tanmoy
%Y Singh, Dhirendra Pratap
%S Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics
%D 2025
%8 December
%I The Asian Federation of Natural Language Processing and The Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-299-2
%F ji-etal-2025-defending
%X Aligned large language models (LLMs) are vulnerable to jailbreaks, which bypass the safeguards of targeted LLMs and fool them into generating objectionable content. While initial defenses show promise against token-based attacks, there are no defenses that provide robustness against semantic attacks and avoid unfavorable trade-offs between robustness and nominal performance. To meet this need, we propose SemanticSmooth, a smoothing-based defense that aggregates the predictions of multiple semantically transformed copies of a given input prompt. Experimental results demonstrate that SemanticSmooth achieves strong robustness against both manually constructed jailbreak prompts and automatic jailbreak attacks like GCG, PAIR, and PromptRS while maintaining strong nominal performance on standard LLM evaluation benchmarks such as AlpacaEval for the instruction-following tasks and PiQA for the question-answering tasks.
%U https://aclanthology.org/2025.ijcnlp-short.2/
%P 7-40
Markdown (Informal)
[Defending Large Language Models against Jailbreak Attacks via Semantic Smoothing](https://aclanthology.org/2025.ijcnlp-short.2/) (Ji et al., IJCNLP-AACL 2025)
ACL
- Jiabao Ji, Bairu Hou, Alexander Robey, George J. Pappas, Hamed Hassani, Yang Zhang, Eric Wong, and Shiyu Chang. 2025. Defending Large Language Models against Jailbreak Attacks via Semantic Smoothing. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 7–40, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.