@inproceedings{huang-etal-2025-intrinsic,
title = "Intrinsic Model Weaknesses: How Priming Attacks Unveil Vulnerabilities in Large Language Models",
author = "Huang, Yuyi and
Zhan, Runzhe and
Wong, Derek F. and
Chao, Lidia S. and
Tao, Ailin",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.77/",
doi = "10.18653/v1/2025.findings-naacl.77",
pages = "1405--1425",
ISBN = "979-8-89176-195-7",
abstract = "Large language models (LLMs) have significantly influenced various industries but suffer from a critical flaw, the potential sensitivity of generating harmful content, which poses severe societal risks. We developed and tested novel attack strategies on popular LLMs to expose their vulnerabilities in generating inappropriate content. These strategies, inspired by psychological phenomena such as the ``Priming Effect'', ``Safe Attention Shift'', and ``Cognitive Dissonance'', effectively attack the models' guarding mechanisms. Our experiments achieved an attack success rate (ASR) of 100{\%} on various open-source models, including Meta{'}s Llama-3.2, Google{'}s Gemma-2, Mistral{'}s Mistral-NeMo, Falcon{'}s Falcon-mamba, Apple{'}s DCLM, Microsoft{'}s Phi3, and Qwen{'}s Qwen2.5, among others. Similarly, for closed-source models such as OpenAI{'}s GPT-4o, Google{'}s Gemini-1.5, and Claude-3.5, we observed an ASR of at least 95{\%} on the AdvBench dataset, which represents the current state-of-the-art. This study underscores the urgent need to reassess the use of generative models in critical applications to mitigate potential adverse societal impacts."
}
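To cite this paper from LaTeX, the BibTeX record above can be saved to a .bib file and referenced by its citekey. A minimal sketch, assuming the entry is stored in a hypothetical anthology.bib and using the standard plain bibliography style (the filename, style choice, and example sentence are illustrative, not part of the record):

\documentclass{article}
\begin{document}
% the citekey matches the BibTeX record above
Priming-style attacks reach high attack success rates on both open- and
closed-source LLMs \cite{huang-etal-2025-intrinsic}.
% "anthology" refers to the assumed anthology.bib holding the entry above
\bibliographystyle{plain}
\bibliography{anthology}
\end{document}

Running latex, then bibtex, then latex twice resolves the citation; with the ACL style files and natbib installed, an ACL-specific bibliography style can be substituted to reproduce the ACL-formatted citation shown at the end of this page.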
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2025-intrinsic">
<titleInfo>
<title>Intrinsic Model Weaknesses: How Priming Attacks Unveil Vulnerabilities in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuyi</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Runzhe</namePart>
<namePart type="family">Zhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derek</namePart>
<namePart type="given">F</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lidia</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Chao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ailin</namePart>
<namePart type="family">Tao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Large language models (LLMs) have significantly influenced various industries but suffer from a critical flaw, the potential sensitivity of generating harmful content, which poses severe societal risks. We developed and tested novel attack strategies on popular LLMs to expose their vulnerabilities in generating inappropriate content. These strategies, inspired by psychological phenomena such as the “Priming Effect”, “Safe Attention Shift”, and “Cognitive Dissonance”, effectively attack the models’ guarding mechanisms. Our experiments achieved an attack success rate (ASR) of 100% on various open-source models, including Meta’s Llama-3.2, Google’s Gemma-2, Mistral’s Mistral-NeMo, Falcon’s Falcon-mamba, Apple’s DCLM, Microsoft’s Phi3, and Qwen’s Qwen2.5, among others. Similarly, for closed-source models such as OpenAI’s GPT-4o, Google’s Gemini-1.5, and Claude-3.5, we observed an ASR of at least 95% on the AdvBench dataset, which represents the current state-of-the-art. This study underscores the urgent need to reassess the use of generative models in critical applications to mitigate potential adverse societal impacts.</abstract>
<identifier type="citekey">huang-etal-2025-intrinsic</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.77</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.77/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>1405</start>
<end>1425</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Intrinsic Model Weaknesses: How Priming Attacks Unveil Vulnerabilities in Large Language Models
%A Huang, Yuyi
%A Zhan, Runzhe
%A Wong, Derek F.
%A Chao, Lidia S.
%A Tao, Ailin
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F huang-etal-2025-intrinsic
%X Large language models (LLMs) have significantly influenced various industries but suffer from a critical flaw, the potential sensitivity of generating harmful content, which poses severe societal risks. We developed and tested novel attack strategies on popular LLMs to expose their vulnerabilities in generating inappropriate content. These strategies, inspired by psychological phenomena such as the “Priming Effect”, “Safe Attention Shift”, and “Cognitive Dissonance”, effectively attack the models’ guarding mechanisms. Our experiments achieved an attack success rate (ASR) of 100% on various open-source models, including Meta’s Llama-3.2, Google’s Gemma-2, Mistral’s Mistral-NeMo, Falcon’s Falcon-mamba, Apple’s DCLM, Microsoft’s Phi3, and Qwen’s Qwen2.5, among others. Similarly, for closed-source models such as OpenAI’s GPT-4o, Google’s Gemini-1.5, and Claude-3.5, we observed an ASR of at least 95% on the AdvBench dataset, which represents the current state-of-the-art. This study underscores the urgent need to reassess the use of generative models in critical applications to mitigate potential adverse societal impacts.
%R 10.18653/v1/2025.findings-naacl.77
%U https://aclanthology.org/2025.findings-naacl.77/
%U https://doi.org/10.18653/v1/2025.findings-naacl.77
%P 1405-1425
Markdown (Informal)
[Intrinsic Model Weaknesses: How Priming Attacks Unveil Vulnerabilities in Large Language Models](https://aclanthology.org/2025.findings-naacl.77/) (Huang et al., Findings 2025)
ACL
Yuyi Huang, Runzhe Zhan, Derek F. Wong, Lidia S. Chao, and Ailin Tao. 2025. Intrinsic Model Weaknesses: How Priming Attacks Unveil Vulnerabilities in Large Language Models. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 1405–1425, Albuquerque, New Mexico. Association for Computational Linguistics.