% ACL Anthology record for the CluSanT paper (NAACL 2025, Volume 1: Long Papers).
% Case-sensitive words in the title are brace-protected as whole words so that
% sentence-casing styles keep the capitalisation; values use brace delimiters.
@inproceedings{awon-etal-2025-clusant,
  title     = {{CluSanT}: Differentially Private and Semantically Coherent Text Sanitization},
  author    = {Awon, Ahmed Musa and
               Lu, Yun and
               Potka, Shera and
               Thomo, Alex},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-long.187/},
  doi       = {10.18653/v1/2025.naacl-long.187},
  pages     = {3676--3693},
  isbn      = {979-8-89176-189-6},
  abstract  = {We introduce CluSanT, a novel text sanitization framework based on Metric Local Differential Privacy (MLDP). Our framework consists of three components: token clustering, cluster embedding, and token sanitization. For the first, CluSanT employs Large Language Models (LLMs) to create a set of potential substitute tokens which we meaningfully cluster. Then, we develop a parameterized cluster embedding that balances the trade-off between privacy and utility. Lastly, we propose an MLDP algorithm which sanitizes/substitutes sensitive tokens in a text with the help of our embedding. Notably, our MLDP-based framework can be tuned with parameters such that (1) existing state-of-the-art (SOTA) token sanitization algorithms can be described{---}and improved{---}via our framework with extremal values of our parameters, and (2) by varying our parameters, we allow for a whole spectrum of privacy-utility tradeoffs between the two SOTA. Our experiments demonstrate CluSanT{'}s balance between privacy and semantic coherence, highlighting its capability as a valuable framework for privacy-preserving text sanitization.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the conference paper with citekey awon-etal-2025-clusant.
     Top-level <name> elements carry the author role; the <relatedItem type="host">
     element describes the proceedings volume (title, editors, publisher, place, ISBN);
     <part> holds the issue date and page extent of the paper within that volume. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="awon-etal-2025-clusant">
    <titleInfo>
      <title>CluSanT: Differentially Private and Semantically Coherent Text Sanitization</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Ahmed</namePart>
      <namePart type="given">Musa</namePart>
      <namePart type="family">Awon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yun</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shera</namePart>
      <namePart type="family">Potka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alex</namePart>
      <namePart type="family">Thomo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-189-6</identifier>
    </relatedItem>
    <abstract>We introduce CluSanT, a novel text sanitization framework based on Metric Local Differential Privacy (MLDP). Our framework consists of three components: token clustering, cluster embedding, and token sanitization. For the first, CluSanT employs Large Language Models (LLMs) to create a set of potential substitute tokens which we meaningfully cluster. Then, we develop a parameterized cluster embedding that balances the trade-off between privacy and utility. Lastly, we propose an MLDP algorithm which sanitizes/substitutes sensitive tokens in a text with the help of our embedding. Notably, our MLDP-based framework can be tuned with parameters such that (1) existing state-of-the-art (SOTA) token sanitization algorithms can be described—and improved—via our framework with extremal values of our parameters, and (2) by varying our parameters, we allow for a whole spectrum of privacy-utility tradeoffs between the two SOTA. Our experiments demonstrate CluSanT’s balance between privacy and semantic coherence, highlighting its capability as a valuable framework for privacy-preserving text sanitization.</abstract>
    <identifier type="citekey">awon-etal-2025-clusant</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-long.187</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-long.187/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>3676</start>
        <end>3693</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T CluSanT: Differentially Private and Semantically Coherent Text Sanitization
%A Awon, Ahmed Musa
%A Lu, Yun
%A Potka, Shera
%A Thomo, Alex
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F awon-etal-2025-clusant
%X We introduce CluSanT, a novel text sanitization framework based on Metric Local Differential Privacy (MLDP). Our framework consists of three components: token clustering, cluster embedding, and token sanitization. For the first, CluSanT employs Large Language Models (LLMs) to create a set of potential substitute tokens which we meaningfully cluster. Then, we develop a parameterized cluster embedding that balances the trade-off between privacy and utility. Lastly, we propose an MLDP algorithm which sanitizes/substitutes sensitive tokens in a text with the help of our embedding. Notably, our MLDP-based framework can be tuned with parameters such that (1) existing state-of-the-art (SOTA) token sanitization algorithms can be described—and improved—via our framework with extremal values of our parameters, and (2) by varying our parameters, we allow for a whole spectrum of privacy-utility tradeoffs between the two SOTA. Our experiments demonstrate CluSanT’s balance between privacy and semantic coherence, highlighting its capability as a valuable framework for privacy-preserving text sanitization.
%R 10.18653/v1/2025.naacl-long.187
%U https://aclanthology.org/2025.naacl-long.187/
%U https://doi.org/10.18653/v1/2025.naacl-long.187
%P 3676-3693
Markdown (Informal)
[CluSanT: Differentially Private and Semantically Coherent Text Sanitization](https://aclanthology.org/2025.naacl-long.187/) (Awon et al., NAACL 2025)
ACL
- Ahmed Musa Awon, Yun Lu, Shera Potka, and Alex Thomo. 2025. CluSanT: Differentially Private and Semantically Coherent Text Sanitization. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 3676–3693, Albuquerque, New Mexico. Association for Computational Linguistics.