% BibTeX record for the workshop paper. The abstract is kept free of
% presentation markup so it matches the MODS and EndNote copies further down.
@inproceedings{zaghouani-2026-linguistic,
  title     = {Linguistic Identity Leakage: When Language Reveals Identity in Anonymized Text},
  author    = {Zaghouani, Wajdi},
  editor    = {Habernal, Ivan and
               Ghanavati, Sepideh and
               Haghighi, Sara and
               Ramesh, Krithika and
               Igamberdiev, Timour and
               Wilson, Shomir},
  booktitle = {Proceedings of the Seventh Workshop on Privacy in Natural Language Processing},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.privatenlp-main.8/},
  doi       = {10.18653/v1/2026.privatenlp-main.8},
  pages     = {107--117},
  isbn      = {979-8-89176-397-5},
  abstract  = {Privacy-preserving natural language processing (NLP) typically focuses on removing explicit identifiers such as names, addresses, and phone numbers. We argue that this approach overlooks a key risk: natural language itself encodes signals about a speaker{'}s geographic origin, social background, and community membership that persist after anonymization. We introduce Linguistic Identity Leakage (LIL), defined as the inference of personal or demographic attributes from linguistic features in text where explicit identifiers have been removed. We further introduce Linguistic Personally Identifiable Information (L-PII) to denote the linguistic features that enable such inference. Drawing on sociolinguistics, stylometry, and NLP privacy research, we propose a taxonomy of linguistic identity signals across five categories and examine implications for dataset release, language model training, and privacy auditing. Using examples from Arabic dialectal variation and other multilingual contexts, we present the Identity Inference Risk (IIR) framework for assessing residual privacy risk in NLP systems and discuss how contemporary LLMs amplify these risks. Our goal is to encourage broader recognition of the gap between conventional anonymization practices and the linguistic reality of natural language data.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS record for a single paper; the ID below is repeated as the citekey identifier further down. -->
<mods ID="zaghouani-2026-linguistic">
<!-- Article-level description: title, the one author, and issue date. -->
<titleInfo>
<title>Linguistic Identity Leakage: When Language Reveals Identity in Anonymized Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing the paper, with its six editors, publisher, place, genre and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh Workshop on Privacy in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Habernal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sepideh</namePart>
<namePart type="family">Ghanavati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Haghighi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Krithika</namePart>
<namePart type="family">Ramesh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timour</namePart>
<namePart type="family">Igamberdiev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shomir</namePart>
<namePart type="family">Wilson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-397-5</identifier>
</relatedItem>
<!-- Plain-text abstract (no inline markup). -->
<abstract>Privacy-preserving natural language processing (NLP) typically focuses on removing explicit identifiers such as names, addresses, and phone numbers. We argue that this approach overlooks a key risk: natural language itself encodes signals about a speaker’s geographic origin, social background, and community membership that persist after anonymization. We introduce Linguistic Identity Leakage (LIL), defined as the inference of personal or demographic attributes from linguistic features in text where explicit identifiers have been removed. We further introduce Linguistic Personally Identifiable Information (L-PII) to denote the linguistic features that enable such inference. Drawing on sociolinguistics, stylometry, and NLP privacy research, we propose a taxonomy of linguistic identity signals across five categories and examine implications for dataset release, language model training, and privacy auditing. Using examples from Arabic dialectal variation and other multilingual contexts, we present the Identity Inference Risk (IIR) framework for assessing residual privacy risk in NLP systems and discuss how contemporary LLMs amplify these risks. Our goal is to encourage broader recognition of the gap between conventional anonymization practices and the linguistic reality of natural language data.</abstract>
<!-- Identifiers and landing-page URL for the paper itself. -->
<identifier type="citekey">zaghouani-2026-linguistic</identifier>
<identifier type="doi">10.18653/v1/2026.privatenlp-main.8</identifier>
<location>
<url>https://aclanthology.org/2026.privatenlp-main.8/</url>
</location>
<!-- Position of the paper within the host volume: date and page range. -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>107</start>
<end>117</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Linguistic Identity Leakage: When Language Reveals Identity in Anonymized Text
%A Zaghouani, Wajdi
%Y Habernal, Ivan
%Y Ghanavati, Sepideh
%Y Haghighi, Sara
%Y Ramesh, Krithika
%Y Igamberdiev, Timour
%Y Wilson, Shomir
%S Proceedings of the Seventh Workshop on Privacy in Natural Language Processing
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-397-5
%F zaghouani-2026-linguistic
%X Privacy-preserving natural language processing (NLP) typically focuses on removing explicit identifiers such as names, addresses, and phone numbers. We argue that this approach overlooks a key risk: natural language itself encodes signals about a speaker’s geographic origin, social background, and community membership that persist after anonymization. We introduce Linguistic Identity Leakage (LIL), defined as the inference of personal or demographic attributes from linguistic features in text where explicit identifiers have been removed. We further introduce Linguistic Personally Identifiable Information (L-PII) to denote the linguistic features that enable such inference. Drawing on sociolinguistics, stylometry, and NLP privacy research, we propose a taxonomy of linguistic identity signals across five categories and examine implications for dataset release, language model training, and privacy auditing. Using examples from Arabic dialectal variation and other multilingual contexts, we present the Identity Inference Risk (IIR) framework for assessing residual privacy risk in NLP systems and discuss how contemporary LLMs amplify these risks. Our goal is to encourage broader recognition of the gap between conventional anonymization practices and the linguistic reality of natural language data.
%R 10.18653/v1/2026.privatenlp-main.8
%U https://aclanthology.org/2026.privatenlp-main.8/
%U https://doi.org/10.18653/v1/2026.privatenlp-main.8
%P 107-117
Markdown (Informal)
[Linguistic Identity Leakage: When Language Reveals Identity in Anonymized Text](https://aclanthology.org/2026.privatenlp-main.8/) (Zaghouani, PrivateNLP 2026)
ACL