@inproceedings{padakandla-etal-2025-safequant,
title = "{S}afe{Q}uant: {LLM} Safety Analysis via Quantized Gradient Inspection",
author = "Padakandla, Sindhu and
Babar, Sadbhavana and
D, Rathod Darshan and
Kaul, Manohar",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.127/",
doi = "10.18653/v1/2025.naacl-long.127",
pages = "2522--2536",
ISBN = "979-8-89176-189-6",
abstract = "Contemporary jailbreak attacks on Large Language Models (LLMs) employ sophisticated techniques with obfuscated content to bypass safety guardrails. Existing defenses either use computationally intensive LLM verification or require adversarial fine-tuning, leaving models vulnerable to advanced attacks. We introduce SafeQuant, a novel defense framework that leverages quantized gradient patterns to identify harmful prompts efficiently. Our key insight is that when generating identical responses like ``Sure'', LLMs exhibit distinctly different internal gradient patterns for safe versus harmful prompts, reflecting conflicts with safety training. By capturing these patterns through selective gradient masking and quantization, SafeQuant significantly outperforms existing defenses across multiple benchmarks while maintaining model utility. The method demonstrates particular effectiveness against sophisticated attacks like WordGame prompts and persuasive adversarial attacks, achieving an F1-score of 0.80 on WordGame dataset and outperforming state-of-the-art (SoTA) methods like GradSafe by an absolute margin of 57{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="padakandla-etal-2025-safequant">
<titleInfo>
<title>SafeQuant: LLM Safety Analysis via Quantized Gradient Inspection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sindhu</namePart>
<namePart type="family">Padakandla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadbhavana</namePart>
<namePart type="family">Babar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rathod</namePart>
<namePart type="given">Darshan</namePart>
<namePart type="family">D</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manohar</namePart>
<namePart type="family">Kaul</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Contemporary jailbreak attacks on Large Language Models (LLMs) employ sophisticated techniques with obfuscated content to bypass safety guardrails. Existing defenses either use computationally intensive LLM verification or require adversarial fine-tuning, leaving models vulnerable to advanced attacks. We introduce SafeQuant, a novel defense framework that leverages quantized gradient patterns to identify harmful prompts efficiently. Our key insight is that when generating identical responses like “Sure”, LLMs exhibit distinctly different internal gradient patterns for safe versus harmful prompts, reflecting conflicts with safety training. By capturing these patterns through selective gradient masking and quantization, SafeQuant significantly outperforms existing defenses across multiple benchmarks while maintaining model utility. The method demonstrates particular effectiveness against sophisticated attacks like WordGame prompts and persuasive adversarial attacks, achieving an F1-score of 0.80 on WordGame dataset and outperforming state-of-the-art (SoTA) methods like GradSafe by an absolute margin of 57%.</abstract>
<identifier type="citekey">padakandla-etal-2025-safequant</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.127</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.127/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>2522</start>
<end>2536</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SafeQuant: LLM Safety Analysis via Quantized Gradient Inspection
%A Padakandla, Sindhu
%A Babar, Sadbhavana
%A D, Rathod Darshan
%A Kaul, Manohar
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F padakandla-etal-2025-safequant
%X Contemporary jailbreak attacks on Large Language Models (LLMs) employ sophisticated techniques with obfuscated content to bypass safety guardrails. Existing defenses either use computationally intensive LLM verification or require adversarial fine-tuning, leaving models vulnerable to advanced attacks. We introduce SafeQuant, a novel defense framework that leverages quantized gradient patterns to identify harmful prompts efficiently. Our key insight is that when generating identical responses like “Sure”, LLMs exhibit distinctly different internal gradient patterns for safe versus harmful prompts, reflecting conflicts with safety training. By capturing these patterns through selective gradient masking and quantization, SafeQuant significantly outperforms existing defenses across multiple benchmarks while maintaining model utility. The method demonstrates particular effectiveness against sophisticated attacks like WordGame prompts and persuasive adversarial attacks, achieving an F1-score of 0.80 on WordGame dataset and outperforming state-of-the-art (SoTA) methods like GradSafe by an absolute margin of 57%.
%R 10.18653/v1/2025.naacl-long.127
%U https://aclanthology.org/2025.naacl-long.127/
%U https://doi.org/10.18653/v1/2025.naacl-long.127
%P 2522-2536
Markdown (Informal)
[SafeQuant: LLM Safety Analysis via Quantized Gradient Inspection](https://aclanthology.org/2025.naacl-long.127/) (Padakandla et al., NAACL 2025)
ACL
Sindhu Padakandla, Sadbhavana Babar, Rathod Darshan D, and Manohar Kaul. 2025. SafeQuant: LLM Safety Analysis via Quantized Gradient Inspection. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 2522–2536, Albuquerque, New Mexico. Association for Computational Linguistics.