@inproceedings{wang-etal-2026-cacheprune,
title = "{C}ache{P}rune: Teaching {LLM}s What Not to Follow via {KV}-Cache Editing",
author = "Wang, Rui and
Wu, Junda and
Xia, Yu and
Yu, Tong and
Zhang, Ruiyi and
Rossi, Ryan A. and
Mitra, Subrata and
Yao, Lina and
McAuley, Julian",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.70/",
pages = "1551--1570",
ISBN = "979-8-89176-390-6",
abstract = "Large Language Models (LLMs) are susceptible to indirect prompt injection attack, where the model inadvertently responds to instructions injected into the prompt context. This vulnerability stems from LLMs' inability to distinguish between data and instructions within a prompt. We propose CachePrune that defends against this attack by identifying and pruning neurons associated with instruction-following, during KV cache encoding of the prompt context. The pruning steers the LLM toward interpreting the context purely as data rather than as instructions to follow. To identify these neurons, we introduce a neural attribution mechanism guided by a preferential attribution loss, and theoretically connect this loss to an upper bound of the Direct Preference Optimization (DPO) objective. Further, we improve on the fidelity of neural attribution by leveraging an observed triggering effect in instruction-following. Our approach does not interfere with prompt formatting or incur test-time overhead in response generation. Experiments show that CachePrune significantly reduces the attack success rate while preserving the LLM{'}s ability to follow user instructions."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2026-cacheprune">
<titleInfo>
<title>CachePrune: Teaching LLMs What Not to Follow via KV-Cache Editing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junda</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruiyi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Rossi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Subrata</namePart>
<namePart type="family">Mitra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">McAuley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) are susceptible to indirect prompt injection attack, where the model inadvertently responds to instructions injected into the prompt context. This vulnerability stems from LLMs’ inability to distinguish between data and instructions within a prompt. We propose CachePrune that defends against this attack by identifying and pruning neurons associated with instruction-following, during KV cache encoding of the prompt context. The pruning steers the LLM toward interpreting the context purely as data rather than as instructions to follow. To identify these neurons, we introduce a neural attribution mechanism guided by a preferential attribution loss, and theoretically connect this loss to an upper bound of the Direct Preference Optimization (DPO) objective. Further, we improve on the fidelity of neural attribution by leveraging an observed triggering effect in instruction-following. Our approach does not interfere with prompt formatting or incur test-time overhead in response generation. Experiments show that CachePrune significantly reduces the attack success rate while preserving the LLM’s ability to follow user instructions.</abstract>
<identifier type="citekey">wang-etal-2026-cacheprune</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.70/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1551</start>
<end>1570</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CachePrune: Teaching LLMs What Not to Follow via KV-Cache Editing
%A Wang, Rui
%A Wu, Junda
%A Xia, Yu
%A Yu, Tong
%A Zhang, Ruiyi
%A Rossi, Ryan A.
%A Mitra, Subrata
%A Yao, Lina
%A McAuley, Julian
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F wang-etal-2026-cacheprune
%X Large Language Models (LLMs) are susceptible to indirect prompt injection attack, where the model inadvertently responds to instructions injected into the prompt context. This vulnerability stems from LLMs’ inability to distinguish between data and instructions within a prompt. We propose CachePrune that defends against this attack by identifying and pruning neurons associated with instruction-following, during KV cache encoding of the prompt context. The pruning steers the LLM toward interpreting the context purely as data rather than as instructions to follow. To identify these neurons, we introduce a neural attribution mechanism guided by a preferential attribution loss, and theoretically connect this loss to an upper bound of the Direct Preference Optimization (DPO) objective. Further, we improve on the fidelity of neural attribution by leveraging an observed triggering effect in instruction-following. Our approach does not interfere with prompt formatting or incur test-time overhead in response generation. Experiments show that CachePrune significantly reduces the attack success rate while preserving the LLM’s ability to follow user instructions.
%U https://aclanthology.org/2026.acl-long.70/
%P 1551-1570
Markdown (Informal)
[CachePrune: Teaching LLMs What Not to Follow via KV-Cache Editing](https://aclanthology.org/2026.acl-long.70/) (Wang et al., ACL 2026)
ACL
- Rui Wang, Junda Wu, Yu Xia, Tong Yu, Ruiyi Zhang, Ryan A. Rossi, Subrata Mitra, Lina Yao, and Julian McAuley. 2026. CachePrune: Teaching LLMs What Not to Follow via KV-Cache Editing. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1551–1570, San Diego, California, United States. Association for Computational Linguistics.