@inproceedings{xiang-etal-2025-alrphfs,
title = "{ALRPHFS}: Adversarially Learned Risk Patterns with Hierarchical Fast {\&} Slow Reasoning for Robust Agent Defense",
author = "Xiang, Shiyu and
Zhang, Tong and
Chen, Ronghao",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1066/",
pages = "19569--19587",
ISBN = "979-8-89176-335-7",
abstract = "LLM Agents are becoming central to intelligent systems. However, their deployment raises serious safety concerns. Existing defenses largely rely on ``Safety Checks'', which struggle to capture the complex semantic risks posed by harmful user inputs or unsafe agent behaviors{---}creating a significant semantic gap between safety checks and real-world risks. To bridge this gap, we propose a novel defense framework, ALRPHFS (Adversarially Learned Risk Patterns with Hierarchical Fast {\&} Slow Reasoning). ALRPHFS consists of two core components: (1) an offline adversarial self-learning loop to iteratively refine a generalizable and balanced library of risk patterns, substantially enhancing robustness without retraining the base LLM, and (2) an online hierarchical fast {\&} slow reasoning engine that balances detection effectiveness with computational efficiency. Experimental results demonstrate that our approach achieves superior overall performance compared to existing baselines, achieving a best{-}in{-}class average accuracy of 80{\%} and exhibiting strong generalizability across agents and tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xiang-etal-2025-alrphfs">
<titleInfo>
<title>ALRPHFS: Adversarially Learned Risk Patterns with Hierarchical Fast &amp; Slow Reasoning for Robust Agent Defense</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shiyu</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronghao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>LLM Agents are becoming central to intelligent systems. However, their deployment raises serious safety concerns. Existing defenses largely rely on “Safety Checks”, which struggle to capture the complex semantic risks posed by harmful user inputs or unsafe agent behaviors—creating a significant semantic gap between safety checks and real-world risks. To bridge this gap, we propose a novel defense framework, ALRPHFS (Adversarially Learned Risk Patterns with Hierarchical Fast &amp; Slow Reasoning). ALRPHFS consists of two core components: (1) an offline adversarial self-learning loop to iteratively refine a generalizable and balanced library of risk patterns, substantially enhancing robustness without retraining the base LLM, and (2) an online hierarchical fast &amp; slow reasoning engine that balances detection effectiveness with computational efficiency. Experimental results demonstrate that our approach achieves superior overall performance compared to existing baselines, achieving a best-in-class average accuracy of 80% and exhibiting strong generalizability across agents and tasks.</abstract>
<identifier type="citekey">xiang-etal-2025-alrphfs</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1066/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>19569</start>
<end>19587</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ALRPHFS: Adversarially Learned Risk Patterns with Hierarchical Fast & Slow Reasoning for Robust Agent Defense
%A Xiang, Shiyu
%A Zhang, Tong
%A Chen, Ronghao
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F xiang-etal-2025-alrphfs
%X LLM Agents are becoming central to intelligent systems. However, their deployment raises serious safety concerns. Existing defenses largely rely on “Safety Checks”, which struggle to capture the complex semantic risks posed by harmful user inputs or unsafe agent behaviors—creating a significant semantic gap between safety checks and real-world risks. To bridge this gap, we propose a novel defense framework, ALRPHFS (Adversarially Learned Risk Patterns with Hierarchical Fast & Slow Reasoning). ALRPHFS consists of two core components: (1) an offline adversarial self-learning loop to iteratively refine a generalizable and balanced library of risk patterns, substantially enhancing robustness without retraining the base LLM, and (2) an online hierarchical fast & slow reasoning engine that balances detection effectiveness with computational efficiency. Experimental results demonstrate that our approach achieves superior overall performance compared to existing baselines, achieving a best-in-class average accuracy of 80% and exhibiting strong generalizability across agents and tasks.
%U https://aclanthology.org/2025.findings-emnlp.1066/
%P 19569-19587
Markdown (Informal)
[ALRPHFS: Adversarially Learned Risk Patterns with Hierarchical Fast & Slow Reasoning for Robust Agent Defense](https://aclanthology.org/2025.findings-emnlp.1066/) (Xiang et al., Findings 2025)
ACL
Shiyu Xiang, Tong Zhang, and Ronghao Chen. 2025. ALRPHFS: Adversarially Learned Risk Patterns with Hierarchical Fast & Slow Reasoning for Robust Agent Defense. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 19569–19587, Suzhou, China. Association for Computational Linguistics.