@inproceedings{guo-etal-2026-personalization,
title = "When Personalization Legitimizes Risks: Uncovering Safety Vulnerabilities in Personalized Dialogue Agents",
author = "Guo, Jiahe and
Guo, Xiangran and
Hu, Yulin and
Long, Zimo and
Sui, Xingyu and
Zhi, Xuda and
Huang, Yongbo and
He, Hao and
Zhao, Weixiang and
Zhao, Yanyan and
Qin, Bing",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.1260/",
pages = "27309--27335",
ISBN = "979-8-89176-390-6",
abstract = "Long-term memory enables large language model (LLM) agents to support personalized and sustained interactions. However, most work on personalized agents prioritizes utility and user experience, treating memory as a neutral component and largely overlooking its safety implications. In this paper, we reveal intent legitimation, a previously underexplored safety failure in personalized agents, where benign personal memories bias intent inference and cause models to legitimize inherently harmful queries. To study this phenomenon, we introduce PS-Bench, a benchmark designed to identify and quantify intent legitimation in personalized interactions. Across multiple memory-augmented agent frameworks and base LLMs, personalization increases attack success rates by **15.8{\%}{--}243.7{\%}** relative to stateless baselines. We further provide mechanistic evidence for intent legitimation from internal representation space, and propose a lightweight detection{--}reflection method that effectively reduces safety degradation. Overall, our work provides the first systematic exploration and evaluation of intent legitimation as a safety failure mode that naturally arises from benign, real-world personalization, highlighting the importance of assessing safety under long-term personal context. **WARNING:** This paper may contain harmful content."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guo-etal-2026-personalization">
<titleInfo>
<title>When Personalization Legitimizes Risks: Uncovering Safety Vulnerabilities in Personalized Dialogue Agents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiahe</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangran</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulin</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zimo</namePart>
<namePart type="family">Long</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingyu</namePart>
<namePart type="family">Sui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuda</namePart>
<namePart type="family">Zhi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongbo</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weixiang</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanyan</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bing</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Long-term memory enables large language model (LLM) agents to support personalized and sustained interactions. However, most work on personalized agents prioritizes utility and user experience, treating memory as a neutral component and largely overlooking its safety implications. In this paper, we reveal intent legitimation, a previously underexplored safety failure in personalized agents, where benign personal memories bias intent inference and cause models to legitimize inherently harmful queries. To study this phenomenon, we introduce PS-Bench, a benchmark designed to identify and quantify intent legitimation in personalized interactions. Across multiple memory-augmented agent frameworks and base LLMs, personalization increases attack success rates by **15.8%–243.7%** relative to stateless baselines. We further provide mechanistic evidence for intent legitimation from internal representation space, and propose a lightweight detection–reflection method that effectively reduces safety degradation. Overall, our work provides the first systematic exploration and evaluation of intent legitimation as a safety failure mode that naturally arises from benign, real-world personalization, highlighting the importance of assessing safety under long-term personal context. **WARNING:** This paper may contain harmful content.</abstract>
<identifier type="citekey">guo-etal-2026-personalization</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.1260/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>27309</start>
<end>27335</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Personalization Legitimizes Risks: Uncovering Safety Vulnerabilities in Personalized Dialogue Agents
%A Guo, Jiahe
%A Guo, Xiangran
%A Hu, Yulin
%A Long, Zimo
%A Sui, Xingyu
%A Zhi, Xuda
%A Huang, Yongbo
%A He, Hao
%A Zhao, Weixiang
%A Zhao, Yanyan
%A Qin, Bing
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F guo-etal-2026-personalization
%X Long-term memory enables large language model (LLM) agents to support personalized and sustained interactions. However, most work on personalized agents prioritizes utility and user experience, treating memory as a neutral component and largely overlooking its safety implications. In this paper, we reveal intent legitimation, a previously underexplored safety failure in personalized agents, where benign personal memories bias intent inference and cause models to legitimize inherently harmful queries. To study this phenomenon, we introduce PS-Bench, a benchmark designed to identify and quantify intent legitimation in personalized interactions. Across multiple memory-augmented agent frameworks and base LLMs, personalization increases attack success rates by **15.8%–243.7%** relative to stateless baselines. We further provide mechanistic evidence for intent legitimation from internal representation space, and propose a lightweight detection–reflection method that effectively reduces safety degradation. Overall, our work provides the first systematic exploration and evaluation of intent legitimation as a safety failure mode that naturally arises from benign, real-world personalization, highlighting the importance of assessing safety under long-term personal context. **WARNING:** This paper may contain harmful content.
%U https://aclanthology.org/2026.acl-long.1260/
%P 27309-27335
Markdown (Informal)
[When Personalization Legitimizes Risks: Uncovering Safety Vulnerabilities in Personalized Dialogue Agents](https://aclanthology.org/2026.acl-long.1260/) (Guo et al., ACL 2026)
ACL
- Jiahe Guo, Xiangran Guo, Yulin Hu, Zimo Long, Xingyu Sui, Xuda Zhi, Yongbo Huang, Hao He, Weixiang Zhao, Yanyan Zhao, and Bing Qin. 2026. When Personalization Legitimizes Risks: Uncovering Safety Vulnerabilities in Personalized Dialogue Agents. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 27309–27335, San Diego, California, United States. Association for Computational Linguistics.