@inproceedings{rahman-harris-2025-summary,
title = "Summary the Savior: Harmful Keyword and Query-based Summarization for {LLM} Jailbreak Defense",
author = "Rahman, Shagoto and
Harris, Ian",
editor = "Cao, Trista and
Das, Anubrata and
Kumarage, Tharindu and
Wan, Yixin and
Krishna, Satyapriya and
Mehrabi, Ninareh and
Dhamala, Jwala and
Ramakrishna, Anil and
Galystan, Aram and
Kumar, Anoop and
Gupta, Rahul and
Chang, Kai-Wei",
booktitle = "Proceedings of the 5th Workshop on Trustworthy NLP (TrustNLP 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.trustnlp-main.17/",
doi = "10.18653/v1/2025.trustnlp-main.17",
pages = "266--275",
ISBN = "979-8-89176-233-6",
abstract = "Large Language Models (LLMs) are widely used for their capabilities, but face threats from jailbreak attacks, which exploit LLMs to generate inappropriate information and bypass their defense system. Existing defenses are often specific to jailbreak attacks and as a result, a robust, attack-independent solution is needed to address both Natural Language Processing (NLP) ambiguities and attack variability. In this study, we have introduced, Summary The Savior, a novel jailbreak detection mechanism leveraging harmful keywords and query-based security-aware summary classification. By analyzing the illegal and improper contents of prompts within the summaries, the proposed method remains robust against attack diversity and NLP ambiguities. Two novel datasets for harmful keyword extraction and security aware summaries utilizing GPT-4 and Llama-3.1 70B respectively have been generated in this regard. Moreover, an ``ambiguous harmful'' class has been introduced to address content and intent ambiguities. Evaluation results demonstrate that, Summary The Savior achieves higher defense performance, outperforming state-of-the-art defense mechanisms namely Perplexity Filtering, SmoothLLM, Erase and Check with lowest attack success rates across various jailbreak attacks namely PAIR, GCG, JBC and Random Search, on Llama-2, Vicuna-13B and GPT-4. Our codes, models, and results are available at: https://github.com/shrestho10/SummaryTheSavior"
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rahman-harris-2025-summary">
<titleInfo>
<title>Summary the Savior: Harmful Keyword and Query-based Summarization for LLM Jailbreak Defense</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shagoto</namePart>
<namePart type="family">Rahman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ian</namePart>
<namePart type="family">Harris</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Trustworthy NLP (TrustNLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trista</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anubrata</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Kumarage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yixin</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Satyapriya</namePart>
<namePart type="family">Krishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anil</namePart>
<namePart type="family">Ramakrishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aram</namePart>
<namePart type="family">Galystan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-233-6</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) are widely used for their capabilities, but face threats from jailbreak attacks, which exploit LLMs to generate inappropriate information and bypass their defense system. Existing defenses are often specific to jailbreak attacks and as a result, a robust, attack-independent solution is needed to address both Natural Language Processing (NLP) ambiguities and attack variability. In this study, we have introduced, Summary The Savior, a novel jailbreak detection mechanism leveraging harmful keywords and query-based security-aware summary classification. By analyzing the illegal and improper contents of prompts within the summaries, the proposed method remains robust against attack diversity and NLP ambiguities. Two novel datasets for harmful keyword extraction and security aware summaries utilizing GPT-4 and Llama-3.1 70B respectively have been generated in this regard. Moreover, an “ambiguous harmful” class has been introduced to address content and intent ambiguities. Evaluation results demonstrate that, Summary The Savior achieves higher defense performance, outperforming state-of-the-art defense mechanisms namely Perplexity Filtering, SmoothLLM, Erase and Check with lowest attack success rates across various jailbreak attacks namely PAIR, GCG, JBC and Random Search, on Llama-2, Vicuna-13B and GPT-4. Our codes, models, and results are available at: https://github.com/shrestho10/SummaryTheSavior</abstract>
<identifier type="citekey">rahman-harris-2025-summary</identifier>
<identifier type="doi">10.18653/v1/2025.trustnlp-main.17</identifier>
<location>
<url>https://aclanthology.org/2025.trustnlp-main.17/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>266</start>
<end>275</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Summary the Savior: Harmful Keyword and Query-based Summarization for LLM Jailbreak Defense
%A Rahman, Shagoto
%A Harris, Ian
%Y Cao, Trista
%Y Das, Anubrata
%Y Kumarage, Tharindu
%Y Wan, Yixin
%Y Krishna, Satyapriya
%Y Mehrabi, Ninareh
%Y Dhamala, Jwala
%Y Ramakrishna, Anil
%Y Galystan, Aram
%Y Kumar, Anoop
%Y Gupta, Rahul
%Y Chang, Kai-Wei
%S Proceedings of the 5th Workshop on Trustworthy NLP (TrustNLP 2025)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-233-6
%F rahman-harris-2025-summary
%X Large Language Models (LLMs) are widely used for their capabilities, but face threats from jailbreak attacks, which exploit LLMs to generate inappropriate information and bypass their defense system. Existing defenses are often specific to jailbreak attacks and as a result, a robust, attack-independent solution is needed to address both Natural Language Processing (NLP) ambiguities and attack variability. In this study, we have introduced, Summary The Savior, a novel jailbreak detection mechanism leveraging harmful keywords and query-based security-aware summary classification. By analyzing the illegal and improper contents of prompts within the summaries, the proposed method remains robust against attack diversity and NLP ambiguities. Two novel datasets for harmful keyword extraction and security aware summaries utilizing GPT-4 and Llama-3.1 70B respectively have been generated in this regard. Moreover, an “ambiguous harmful” class has been introduced to address content and intent ambiguities. Evaluation results demonstrate that, Summary The Savior achieves higher defense performance, outperforming state-of-the-art defense mechanisms namely Perplexity Filtering, SmoothLLM, Erase and Check with lowest attack success rates across various jailbreak attacks namely PAIR, GCG, JBC and Random Search, on Llama-2, Vicuna-13B and GPT-4. Our codes, models, and results are available at: https://github.com/shrestho10/SummaryTheSavior
%R 10.18653/v1/2025.trustnlp-main.17
%U https://aclanthology.org/2025.trustnlp-main.17/
%U https://doi.org/10.18653/v1/2025.trustnlp-main.17
%P 266-275
Markdown (Informal)
[Summary the Savior: Harmful Keyword and Query-based Summarization for LLM Jailbreak Defense](https://aclanthology.org/2025.trustnlp-main.17/) (Rahman & Harris, TrustNLP 2025)
ACL