@inproceedings{rubenbauer-etal-2026-detection,
title = "Detection of Adversarial Prompts with Model Predictive Entropy",
author = {Rubenbauer, Franziska and
Steindl, Sebastian and
Levi, Patrick and
Loebenberger, Daniel and
Sch{\"a}fer, Ulrich},
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.103/",
pages = "1979--1993",
ISBN = "979-8-89176-386-9",
abstract = "Large Language Models (LLMs) are increasingly deployed in high-impact scenarios, raising concerns about their safety and security. Despite existing defense mechanisms, LLMs remain vulnerable to adversarial attacks. This paper introduces the novel attack-agnostic pipeline SENTRY (semantic entropy-based attack recognition system) for detecting such attacks by leveraging the predictive entropy of model outputs, quantified through the Token-Level Shifting Attention to Relevance (TokenSAR) score, a weighted token entropy measurement. Our approach dynamically identifies adversarial inputs without relying on prior knowledge of attack specifications. It requires only ten newly generated tokens, making it a computationally efficient and adaptable solution. We evaluate the pipeline on multiple state-of-the-art models, including Llama, Vicuna, Falcon, Deep Seek, and Mistral, using a diverse set of adversarial prompts generated via the h4rm31 framework. Experimental results demonstrate a clear separation in TokenSAR scores between benign, malicious, and adversarial prompts. This distinction enables effective threshold-based classification, achieving robust detection performance across various model architectures. Our method outperforms traditional defenses in terms of adaptability and resource efficiency."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rubenbauer-etal-2026-detection">
<titleInfo>
<title>Detection of Adversarial Prompts with Model Predictive Entropy</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franziska</namePart>
<namePart type="family">Rubenbauer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Steindl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Levi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Loebenberger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ulrich</namePart>
<namePart type="family">Schäfer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) are increasingly deployed in high-impact scenarios, raising concerns about their safety and security. Despite existing defense mechanisms, LLMs remain vulnerable to adversarial attacks. This paper introduces the novel attack-agnostic pipeline SENTRY (semantic entropy-based attack recognition system) for detecting such attacks by leveraging the predictive entropy of model outputs, quantified through the Token-Level Shifting Attention to Relevance (TokenSAR) score, a weighted token entropy measurement. Our approach dynamically identifies adversarial inputs without relying on prior knowledge of attack specifications. It requires only ten newly generated tokens, making it a computationally efficient and adaptable solution. We evaluate the pipeline on multiple state-of-the-art models, including Llama, Vicuna, Falcon, Deep Seek, and Mistral, using a diverse set of adversarial prompts generated via the h4rm31 framework. Experimental results demonstrate a clear separation in TokenSAR scores between benign, malicious, and adversarial prompts. This distinction enables effective threshold-based classification, achieving robust detection performance across various model architectures. Our method outperforms traditional defenses in terms of adaptability and resource efficiency.</abstract>
<identifier type="citekey">rubenbauer-etal-2026-detection</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.103/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>1979</start>
<end>1993</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Detection of Adversarial Prompts with Model Predictive Entropy
%A Rubenbauer, Franziska
%A Steindl, Sebastian
%A Levi, Patrick
%A Loebenberger, Daniel
%A Schäfer, Ulrich
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F rubenbauer-etal-2026-detection
%X Large Language Models (LLMs) are increasingly deployed in high-impact scenarios, raising concerns about their safety and security. Despite existing defense mechanisms, LLMs remain vulnerable to adversarial attacks. This paper introduces the novel attack-agnostic pipeline SENTRY (semantic entropy-based attack recognition system) for detecting such attacks by leveraging the predictive entropy of model outputs, quantified through the Token-Level Shifting Attention to Relevance (TokenSAR) score, a weighted token entropy measurement. Our approach dynamically identifies adversarial inputs without relying on prior knowledge of attack specifications. It requires only ten newly generated tokens, making it a computationally efficient and adaptable solution. We evaluate the pipeline on multiple state-of-the-art models, including Llama, Vicuna, Falcon, Deep Seek, and Mistral, using a diverse set of adversarial prompts generated via the h4rm31 framework. Experimental results demonstrate a clear separation in TokenSAR scores between benign, malicious, and adversarial prompts. This distinction enables effective threshold-based classification, achieving robust detection performance across various model architectures. Our method outperforms traditional defenses in terms of adaptability and resource efficiency.
%U https://aclanthology.org/2026.findings-eacl.103/
%P 1979-1993
Markdown (Informal)
[Detection of Adversarial Prompts with Model Predictive Entropy](https://aclanthology.org/2026.findings-eacl.103/) (Rubenbauer et al., Findings 2026)
ACL
- Franziska Rubenbauer, Sebastian Steindl, Patrick Levi, Daniel Loebenberger, and Ulrich Schäfer. 2026. Detection of Adversarial Prompts with Model Predictive Entropy. In Findings of the Association for Computational Linguistics: EACL 2026, pages 1979–1993, Rabat, Morocco. Association for Computational Linguistics.