@inproceedings{chen-etal-2026-robustness,
title = "Robustness via Referencing: Defending against Prompt Injection Attacks by Referencing the Executed Instruction",
author = "Chen, Yulin and
Li, Haoran and
Sui, Yuan and
Liu, Yue and
He, Yufei and
Bai, Xiaoling and
Fei, Chi and
Yabo, Li and
Ma, Haozhe and
Song, Yangqiu and
Hooi, Bryan",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.61/",
pages = "1194--1215",
ISBN = "979-8-89176-395-1",
abstract = "Prompt injection attacks manipulate large language models (LLMs) by misleading them to deviate from the original input instructions and execute maliciously injected instructions, because of their instruction-following capabilities and inability to distinguish between the original input instructions and maliciously injected instructions. Currently, various prompt injection defense methods have been proposed, including prompt-engineering-based approaches and fine-tuning methods. Most of these methods instruct the model to follow the original input instructions, suppressing its inherent tendencies to follow the injected instructions. However, experimental results reveal that suppressing the model{'}s instruction-following tendencies is challenging. After analyzing successful attack cases, we find that the LLMs can correctly reference the instructions they are executing in some cases. Motivated by this finding, we propose a defense method that leverages LLMs' instruction-following abilities rather than suppressing them. Our approach prompts LLMs to generate responses that include both the answers and their corresponding instruction references. Based on these references, we filter out answers whose references are not to the original input instructions. We conduct comprehensive experiments to evaluate the effectiveness of our proposed method. The results show that our approach outperforms prompt-engineering-based baselines and is comparable to fine-tuning methods, reducing the ASR to nearly 0{\%} in some scenarios. Moreover, our approach has minimal impact on overall utility."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2026-robustness">
<titleInfo>
<title>Robustness via Referencing: Defending against Prompt Injection Attacks by Referencing the Executed Instruction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yulin</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoran</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuan</namePart>
<namePart type="family">Sui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yufei</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoling</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chi</namePart>
<namePart type="family">Fei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="family">Yabo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haozhe</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangqiu</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryan</namePart>
<namePart type="family">Hooi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P.</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Prompt injection attacks manipulate large language models (LLMs) by misleading them to deviate from the original input instructions and execute maliciously injected instructions, because of their instruction-following capabilities and inability to distinguish between the original input instructions and maliciously injected instructions. Currently, various prompt injection defense methods have been proposed, including prompt-engineering-based approaches and fine-tuning methods. Most of these methods instruct the model to follow the original input instructions, suppressing its inherent tendencies to follow the injected instructions. However, experimental results reveal that suppressing the model’s instruction-following tendencies is challenging. After analyzing successful attack cases, we find that the LLMs can correctly reference the instructions they are executing in some cases. Motivated by this finding, we propose a defense method that leverages LLMs’ instruction-following abilities rather than suppressing them. Our approach prompts LLMs to generate responses that include both the answers and their corresponding instruction references. Based on these references, we filter out answers whose references are not to the original input instructions. We conduct comprehensive experiments to evaluate the effectiveness of our proposed method. The results show that our approach outperforms prompt-engineering-based baselines and is comparable to fine-tuning methods, reducing the ASR to nearly 0% in some scenarios. Moreover, our approach has minimal impact on overall utility.</abstract>
<identifier type="citekey">chen-etal-2026-robustness</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.61/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1194</start>
<end>1215</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Robustness via Referencing: Defending against Prompt Injection Attacks by Referencing the Executed Instruction
%A Chen, Yulin
%A Li, Haoran
%A Sui, Yuan
%A Liu, Yue
%A He, Yufei
%A Bai, Xiaoling
%A Fei, Chi
%A Yabo, Li
%A Ma, Haozhe
%A Song, Yangqiu
%A Hooi, Bryan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F chen-etal-2026-robustness
%X Prompt injection attacks manipulate large language models (LLMs) by misleading them to deviate from the original input instructions and execute maliciously injected instructions, because of their instruction-following capabilities and inability to distinguish between the original input instructions and maliciously injected instructions. Currently, various prompt injection defense methods have been proposed, including prompt-engineering-based approaches and fine-tuning methods. Most of these methods instruct the model to follow the original input instructions, suppressing its inherent tendencies to follow the injected instructions. However, experimental results reveal that suppressing the model’s instruction-following tendencies is challenging. After analyzing successful attack cases, we find that the LLMs can correctly reference the instructions they are executing in some cases. Motivated by this finding, we propose a defense method that leverages LLMs’ instruction-following abilities rather than suppressing them. Our approach prompts LLMs to generate responses that include both the answers and their corresponding instruction references. Based on these references, we filter out answers whose references are not to the original input instructions. We conduct comprehensive experiments to evaluate the effectiveness of our proposed method. The results show that our approach outperforms prompt-engineering-based baselines and is comparable to fine-tuning methods, reducing the ASR to nearly 0% in some scenarios. Moreover, our approach has minimal impact on overall utility.
%U https://aclanthology.org/2026.findings-acl.61/
%P 1194-1215
Markdown (Informal)
[Robustness via Referencing: Defending against Prompt Injection Attacks by Referencing the Executed Instruction](https://aclanthology.org/2026.findings-acl.61/) (Chen et al., Findings 2026)
ACL
- Yulin Chen, Haoran Li, Yuan Sui, Yue Liu, Yufei He, Xiaoling Bai, Chi Fei, Li Yabo, Haozhe Ma, Yangqiu Song, and Bryan Hooi. 2026. Robustness via Referencing: Defending against Prompt Injection Attacks by Referencing the Executed Instruction. In Findings of the Association for Computational Linguistics: ACL 2026, pages 1194–1215, San Diego, California, United States. Association for Computational Linguistics.