@inproceedings{ahrend-etal-2026-safer,
title = "Safer Reasoning Traces: Measuring and Mitigating Chain-of-Thought Leakage in {LLM}s",
author = "Ahrend, Patrick and
Eder, Tobias and
Yang, Xiyang and
Pan, Zhiyi and
Groh, Georg",
editor = "Habernal, Ivan and
Ghanavati, Sepideh and
Haghighi, Sara and
Ramesh, Krithika and
Igamberdiev, Timour and
Wilson, Shomir",
booktitle = "Proceedings of the Seventh Workshop on Privacy in Natural Language Processing",
month = jul,
year = "2026",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.privatenlp-main.10/",
doi = "10.18653/v1/2026.privatenlp-main.10",
pages = "140--164",
ISBN = "979-8-89176-397-5",
abstract = "Chain-of-Thought (CoT) prompting improves LLM reasoning but can increase privacy risk by resurfacing personally identifiable information (PII) from the prompt into reasoning traces and outputs, even under policies that instruct the model not to restate PII. We study such direct, inference-time PII leakage using a model-agnostic framework that (i) defines leakage as risk-weighted, token-level events across 11 PII types, (ii) traces leakage curves as a function of the allowed CoT budget, and (iii) compares open- and closed-source model families on a structured PII dataset with a hierarchical risk taxonomy. We find that CoT consistently elevates leakage, especially for high-risk categories, and that leakage is strongly family- and budget-dependent: increasing the reasoning budget can either amplify or attenuate leakage depending on the base model. We then benchmark lightweight inference-time gatekeepers: a rule-based detector, a TF{--}IDF + logistic regression classifier, a GLiNER-based NER model, and an LLM-as-judge, using risk-weighted F1, Macro-F1, and recall. No single method dominates across models or budgets, motivating hybrid, style-adaptive gatekeeping policies that balance utility and risk under a common, reproducible protocol."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahrend-etal-2026-safer">
<titleInfo>
<title>Safer Reasoning Traces: Measuring and Mitigating Chain-of-Thought Leakage in LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Ahrend</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tobias</namePart>
<namePart type="family">Eder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiyang</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyi</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Groh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh Workshop on Privacy in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Habernal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sepideh</namePart>
<namePart type="family">Ghanavati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Haghighi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Krithika</namePart>
<namePart type="family">Ramesh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timour</namePart>
<namePart type="family">Igamberdiev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shomir</namePart>
<namePart type="family">Wilson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-397-5</identifier>
</relatedItem>
<abstract>Chain-of-Thought (CoT) prompting improves LLM reasoning but can increase privacy risk by resurfacing personally identifiable information (PII) from the prompt into reasoning traces and outputs, even under policies that instruct the model not to restate PII. We study such direct, inference-time PII leakage using a model-agnostic framework that (i) defines leakage as risk-weighted, token-level events across 11 PII types, (ii) traces leakage curves as a function of the allowed CoT budget, and (iii) compares open- and closed-source model families on a structured PII dataset with a hierarchical risk taxonomy. We find that CoT consistently elevates leakage, especially for high-risk categories, and that leakage is strongly family- and budget-dependent: increasing the reasoning budget can either amplify or attenuate leakage depending on the base model. We then benchmark lightweight inference-time gatekeepers: a rule-based detector, a TF–IDF + logistic regression classifier, a GLiNER-based NER model, and an LLM-as-judge, using risk-weighted F1, Macro-F1, and recall. No single method dominates across models or budgets, motivating hybrid, style-adaptive gatekeeping policies that balance utility and risk under a common, reproducible protocol.</abstract>
<identifier type="citekey">ahrend-etal-2026-safer</identifier>
<identifier type="doi">10.18653/v1/2026.privatenlp-main.10</identifier>
<location>
<url>https://aclanthology.org/2026.privatenlp-main.10/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>140</start>
<end>164</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Safer Reasoning Traces: Measuring and Mitigating Chain-of-Thought Leakage in LLMs
%A Ahrend, Patrick
%A Eder, Tobias
%A Yang, Xiyang
%A Pan, Zhiyi
%A Groh, Georg
%Y Habernal, Ivan
%Y Ghanavati, Sepideh
%Y Haghighi, Sara
%Y Ramesh, Krithika
%Y Igamberdiev, Timour
%Y Wilson, Shomir
%S Proceedings of the Seventh Workshop on Privacy in Natural Language Processing
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-397-5
%F ahrend-etal-2026-safer
%X Chain-of-Thought (CoT) prompting improves LLM reasoning but can increase privacy risk by resurfacing personally identifiable information (PII) from the prompt into reasoning traces and outputs, even under policies that instruct the model not to restate PII. We study such direct, inference-time PII leakage using a model-agnostic framework that (i) defines leakage as risk-weighted, token-level events across 11 PII types, (ii) traces leakage curves as a function of the allowed CoT budget, and (iii) compares open- and closed-source model families on a structured PII dataset with a hierarchical risk taxonomy. We find that CoT consistently elevates leakage, especially for high-risk categories, and that leakage is strongly family- and budget-dependent: increasing the reasoning budget can either amplify or attenuate leakage depending on the base model. We then benchmark lightweight inference-time gatekeepers: a rule-based detector, a TF–IDF + logistic regression classifier, a GLiNER-based NER model, and an LLM-as-judge, using risk-weighted F1, Macro-F1, and recall. No single method dominates across models or budgets, motivating hybrid, style-adaptive gatekeeping policies that balance utility and risk under a common, reproducible protocol.
%R 10.18653/v1/2026.privatenlp-main.10
%U https://aclanthology.org/2026.privatenlp-main.10/
%U https://doi.org/10.18653/v1/2026.privatenlp-main.10
%P 140-164
Markdown (Informal)
[Safer Reasoning Traces: Measuring and Mitigating Chain-of-Thought Leakage in LLMs](https://aclanthology.org/2026.privatenlp-main.10/) (Ahrend et al., PrivateNLP 2026)
ACL