@inproceedings{dong-etal-2025-ive,
title = "``{I}{'}ve Decided to Leak'': Probing Internals Behind Prompt Leakage Intents",
author = "Dong, Jianshuo and
Zhang, Yutong and
Yan, Liu and
Zhong, Zhenyu and
Wei, Tao and
Xu, Ke and
Huang, Minlie and
Zhang, Chao and
Qiu, Han",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1082/",
pages = "21329--21359",
ISBN = "979-8-89176-332-6",
abstract = "Large language models (LLMs) exhibit prompt leakage vulnerabilities, where they may be coaxed into revealing system prompts embedded in LLM services, raising intellectual property and confidentiality concerns. An intriguing question arises: Do LLMs genuinely internalize prompt leakage intents in their hidden states before generating tokens? In this work, we use probing techniques to capture LLMs' intent-related internal representations and confirm that the answer is yes. We start by comprehensively inducing prompt leakage behaviors across diverse system prompts, attack queries, and decoding methods. We develop a hybrid labeling pipeline, enabling the identification of broader prompt leakage behaviors beyond mere verbatim leaks. Our results show that a simple linear probe can predict prompt leakage risks from pre-generation hidden states without generating any tokens. Across all tested models, linear probes consistently achieve 90{\%}+ AUROC, even when applied to new system prompts and attacks. Understanding the model internals behind prompt leakage drives practical applications, including intention-based detection of prompt leakage risks. Code is available at: https://github.com/jianshuod/Probing-leak-intents."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dong-etal-2025-ive">
<titleInfo>
<title>“I’ve Decided to Leak”: Probing Internals Behind Prompt Leakage Intents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jianshuo</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yutong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liu</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenyu</namePart>
<namePart type="family">Zhong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tao</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ke</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minlie</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Han</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Large language models (LLMs) exhibit prompt leakage vulnerabilities, where they may be coaxed into revealing system prompts embedded in LLM services, raising intellectual property and confidentiality concerns. An intriguing question arises: Do LLMs genuinely internalize prompt leakage intents in their hidden states before generating tokens? In this work, we use probing techniques to capture LLMs’ intent-related internal representations and confirm that the answer is yes. We start by comprehensively inducing prompt leakage behaviors across diverse system prompts, attack queries, and decoding methods. We develop a hybrid labeling pipeline, enabling the identification of broader prompt leakage behaviors beyond mere verbatim leaks. Our results show that a simple linear probe can predict prompt leakage risks from pre-generation hidden states without generating any tokens. Across all tested models, linear probes consistently achieve 90%+ AUROC, even when applied to new system prompts and attacks. Understanding the model internals behind prompt leakage drives practical applications, including intention-based detection of prompt leakage risks. Code is available at: https://github.com/jianshuod/Probing-leak-intents.</abstract>
<identifier type="citekey">dong-etal-2025-ive</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1082/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>21329</start>
<end>21359</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T “I’ve Decided to Leak”: Probing Internals Behind Prompt Leakage Intents
%A Dong, Jianshuo
%A Zhang, Yutong
%A Yan, Liu
%A Zhong, Zhenyu
%A Wei, Tao
%A Xu, Ke
%A Huang, Minlie
%A Zhang, Chao
%A Qiu, Han
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F dong-etal-2025-ive
%X Large language models (LLMs) exhibit prompt leakage vulnerabilities, where they may be coaxed into revealing system prompts embedded in LLM services, raising intellectual property and confidentiality concerns. An intriguing question arises: Do LLMs genuinely internalize prompt leakage intents in their hidden states before generating tokens? In this work, we use probing techniques to capture LLMs’ intent-related internal representations and confirm that the answer is yes. We start by comprehensively inducing prompt leakage behaviors across diverse system prompts, attack queries, and decoding methods. We develop a hybrid labeling pipeline, enabling the identification of broader prompt leakage behaviors beyond mere verbatim leaks. Our results show that a simple linear probe can predict prompt leakage risks from pre-generation hidden states without generating any tokens. Across all tested models, linear probes consistently achieve 90%+ AUROC, even when applied to new system prompts and attacks. Understanding the model internals behind prompt leakage drives practical applications, including intention-based detection of prompt leakage risks. Code is available at: https://github.com/jianshuod/Probing-leak-intents.
%U https://aclanthology.org/2025.emnlp-main.1082/
%P 21329-21359
Markdown (Informal)
[“I’ve Decided to Leak”: Probing Internals Behind Prompt Leakage Intents](https://aclanthology.org/2025.emnlp-main.1082/) (Dong et al., EMNLP 2025)
ACL
- Jianshuo Dong, Yutong Zhang, Liu Yan, Zhenyu Zhong, Tao Wei, Ke Xu, Minlie Huang, Chao Zhang, and Han Qiu. 2025. “I’ve Decided to Leak”: Probing Internals Behind Prompt Leakage Intents. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 21329–21359, Suzhou, China. Association for Computational Linguistics.