@inproceedings{frikha-etal-2025-privacyscalpel,
title = "{P}rivacy{S}calpel: Enhancing {LLM} Privacy via Interpretable Feature Intervention with Sparse Autoencoders",
author = "Frikha, Ahmed and
Razi, Muhammad Reza Ar and
Nakka, Krishna Kanth and
Mendes, Ricardo and
Jiang, Xue and
Zhou, Xuebing",
editor = "Belinkov, Yonatan and
Mueller, Aaron and
Kim, Najoung and
Mohebbi, Hosein and
Chen, Hanjie and
Arad, Dana and
Sarti, Gabriele",
booktitle = "Proceedings of the 8th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.blackboxnlp-1.13/",
pages = "226--238",
ISBN = "979-8-89176-346-3",
abstract = "Large Language Models (LLMs) achieve impressive natural language processing performance but can memorize and leak Personally Identifiable Information (PII), posing serious privacy risks. Existing mitigation strategies{---}such as differential privacy and neuron-level interventions{---}often degrade utility or fail to reliably prevent leakage. We present PrivacyScalpel, a privacy-preserving framework that leverages LLM interpretability to identify and suppress PII leakage while preserving performance. PrivacyScalpel operates in three stages: (1) Feature Probing to locate model layers encoding PII-rich representations; (2) Sparse Autoencoding using a k-Sparse Autoencoder (k-SAE) to disentangle and isolate privacy-sensitive features; and (3) Feature-Level Interventions via targeted ablation and vector steering to reduce leakage. Experiments on Gemma2-2B and Llama2-7B fine-tuned with the Enron dataset show that PrivacyScalpel reduces email leakage from 5.15{\%} to 0.0{\%} while retaining over 99.4{\%} of the original utility. Compared to neuron-level methods, our approach achieves a superior privacy{--}utility trade-off, highlighting the effectiveness of targeting sparse, monosemantic features over polysemantic neurons. Beyond privacy gains, PrivacyScalpel offers interpretability insights into PII memorization mechanisms, contributing to safer and more transparent LLM deployment."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="frikha-etal-2025-privacyscalpel">
<titleInfo>
<title>PrivacyScalpel: Enhancing LLM Privacy via Interpretable Feature Intervention with Sparse Autoencoders</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Frikha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="given">Reza</namePart>
<namePart type="given">Ar</namePart>
<namePart type="family">Razi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Krishna</namePart>
<namePart type="given">Kanth</namePart>
<namePart type="family">Nakka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ricardo</namePart>
<namePart type="family">Mendes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xue</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuebing</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aaron</namePart>
<namePart type="family">Mueller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Najoung</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hosein</namePart>
<namePart type="family">Mohebbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanjie</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dana</namePart>
<namePart type="family">Arad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriele</namePart>
<namePart type="family">Sarti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-346-3</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) achieve impressive natural language processing performance but can memorize and leak Personally Identifiable Information (PII), posing serious privacy risks. Existing mitigation strategies—such as differential privacy and neuron-level interventions—often degrade utility or fail to reliably prevent leakage. We present PrivacyScalpel, a privacy-preserving framework that leverages LLM interpretability to identify and suppress PII leakage while preserving performance. PrivacyScalpel operates in three stages: (1) Feature Probing to locate model layers encoding PII-rich representations; (2) Sparse Autoencoding using a k-Sparse Autoencoder (k-SAE) to disentangle and isolate privacy-sensitive features; and (3) Feature-Level Interventions via targeted ablation and vector steering to reduce leakage. Experiments on Gemma2-2B and Llama2-7B fine-tuned with the Enron dataset show that PrivacyScalpel reduces email leakage from 5.15% to 0.0% while retaining over 99.4% of the original utility. Compared to neuron-level methods, our approach achieves a superior privacy–utility trade-off, highlighting the effectiveness of targeting sparse, monosemantic features over polysemantic neurons. Beyond privacy gains, PrivacyScalpel offers interpretability insights into PII memorization mechanisms, contributing to safer and more transparent LLM deployment.</abstract>
<identifier type="citekey">frikha-etal-2025-privacyscalpel</identifier>
<location>
<url>https://aclanthology.org/2025.blackboxnlp-1.13/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>226</start>
<end>238</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PrivacyScalpel: Enhancing LLM Privacy via Interpretable Feature Intervention with Sparse Autoencoders
%A Frikha, Ahmed
%A Razi, Muhammad Reza Ar
%A Nakka, Krishna Kanth
%A Mendes, Ricardo
%A Jiang, Xue
%A Zhou, Xuebing
%Y Belinkov, Yonatan
%Y Mueller, Aaron
%Y Kim, Najoung
%Y Mohebbi, Hosein
%Y Chen, Hanjie
%Y Arad, Dana
%Y Sarti, Gabriele
%S Proceedings of the 8th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-346-3
%F frikha-etal-2025-privacyscalpel
%X Large Language Models (LLMs) achieve impressive natural language processing performance but can memorize and leak Personally Identifiable Information (PII), posing serious privacy risks. Existing mitigation strategies—such as differential privacy and neuron-level interventions—often degrade utility or fail to reliably prevent leakage. We present PrivacyScalpel, a privacy-preserving framework that leverages LLM interpretability to identify and suppress PII leakage while preserving performance. PrivacyScalpel operates in three stages: (1) Feature Probing to locate model layers encoding PII-rich representations; (2) Sparse Autoencoding using a k-Sparse Autoencoder (k-SAE) to disentangle and isolate privacy-sensitive features; and (3) Feature-Level Interventions via targeted ablation and vector steering to reduce leakage. Experiments on Gemma2-2B and Llama2-7B fine-tuned with the Enron dataset show that PrivacyScalpel reduces email leakage from 5.15% to 0.0% while retaining over 99.4% of the original utility. Compared to neuron-level methods, our approach achieves a superior privacy–utility trade-off, highlighting the effectiveness of targeting sparse, monosemantic features over polysemantic neurons. Beyond privacy gains, PrivacyScalpel offers interpretability insights into PII memorization mechanisms, contributing to safer and more transparent LLM deployment.
%U https://aclanthology.org/2025.blackboxnlp-1.13/
%P 226-238
Markdown (Informal)
[PrivacyScalpel: Enhancing LLM Privacy via Interpretable Feature Intervention with Sparse Autoencoders](https://aclanthology.org/2025.blackboxnlp-1.13/) (Frikha et al., BlackboxNLP 2025)
ACL