% BibTeX record for the RAP-ID paper (Findings of ACL 2026); see the url field for the source page.
@inproceedings{yang-etal-2026-rap,
    title = "{RAP}-{ID}: Mechanistic Prompt Injection Detection via Impostor Behavior Analysis",
    author = "Yang, Yuchen and
      Peng, Lei and
      He, Yujie and
      Yu, Yang and
      Wu, Zhongxin and
      Shi, Yanlei",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.findings-acl.738/",
    pages = "15008--15019",
    isbn = "979-8-89176-395-1",
    abstract = "Large Language Models are increasingly integrated into critical applications, yet they remain vulnerable to prompt injection attacks where meticulously designed adversarial inputs bypass safety alignment. Existing defenses often rely on externally deployed guardrail models or response inspection, which incur significant computational overhead and latency. We propose RAP-ID (Robust Alignment Preservation via Injection Defense), a mechanistic, train-free detection framework that operates exclusively on internal state dynamics during the initial forward pass. RAP-ID identifies attacks by detecting their inevitable ``impostor'' behavior: they must mimic system instruction semantics (Directive Likeness), usurp attention from the true system prompt (Counterfactual Gain), and trigger latent risk concepts (Policy Conflict). By fusing these three internal signals, RAP-ID achieves effective detection across diverse attack vectors{---}from direct jailbreaks to stealthy agentic manipulations{---}without requiring text generation. Comprehensive evaluations demonstrate that RAP-ID achieves competitive performance with significant overall improvements compared to heuristic methods. Crucially, as a train-free solution, it incurs minimal computational overhead and delivers fast response times, making it well-suited for real-time deployment."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-etal-2026-rap">
<titleInfo>
<title>RAP-ID: Mechanistic Prompt Injection Detection via Impostor Behavior Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuchen</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yujie</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongxin</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanlei</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large Language Models are increasingly integrated into critical applications, yet they remain vulnerable to prompt injection attacks where meticulously designed adversarial inputs bypass safety alignment. Existing defenses often rely on externally deployed guardrail models or response inspection, which incur significant computational overhead and latency. We propose RAP-ID (Robust Alignment Preservation via Injection Defense), a mechanistic, train-free detection framework that operates exclusively on internal state dynamics during the initial forward pass. RAP-ID identifies attacks by detecting their inevitable “impostor” behavior: they must mimic system instruction semantics (Directive Likeness), usurp attention from the true system prompt (Counterfactual Gain), and trigger latent risk concepts (Policy Conflict). By fusing these three internal signals, RAP-ID achieves effective detection across diverse attack vectors—from direct jailbreaks to stealthy agentic manipulations—without requiring text generation. Comprehensive evaluations demonstrate that RAP-ID achieves competitive performance with significant overall improvements compared to heuristic methods. Crucially, as a train-free solution, it incurs minimal computational overhead and delivers fast response times, making it well-suited for real-time deployment.</abstract>
<identifier type="citekey">yang-etal-2026-rap</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.738/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>15008</start>
<end>15019</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T RAP-ID: Mechanistic Prompt Injection Detection via Impostor Behavior Analysis
%A Yang, Yuchen
%A Peng, Lei
%A He, Yujie
%A Yu, Yang
%A Wu, Zhongxin
%A Shi, Yanlei
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F yang-etal-2026-rap
%X Large Language Models are increasingly integrated into critical applications, yet they remain vulnerable to prompt injection attacks where meticulously designed adversarial inputs bypass safety alignment. Existing defenses often rely on externally deployed guardrail models or response inspection, which incur significant computational overhead and latency. We propose RAP-ID (Robust Alignment Preservation via Injection Defense), a mechanistic, train-free detection framework that operates exclusively on internal state dynamics during the initial forward pass. RAP-ID identifies attacks by detecting their inevitable “impostor” behavior: they must mimic system instruction semantics (Directive Likeness), usurp attention from the true system prompt (Counterfactual Gain), and trigger latent risk concepts (Policy Conflict). By fusing these three internal signals, RAP-ID achieves effective detection across diverse attack vectors—from direct jailbreaks to stealthy agentic manipulations—without requiring text generation. Comprehensive evaluations demonstrate that RAP-ID achieves competitive performance with significant overall improvements compared to heuristic methods. Crucially, as a train-free solution, it incurs minimal computational overhead and delivers fast response times, making it well-suited for real-time deployment.
%U https://aclanthology.org/2026.findings-acl.738/
%P 15008-15019
Markdown (Informal)
[RAP-ID: Mechanistic Prompt Injection Detection via Impostor Behavior Analysis](https://aclanthology.org/2026.findings-acl.738/) (Yang et al., Findings 2026)
ACL