@article{faustini-etal-2025-idt,
title = "{IDT}: Dual-Task Adversarial Rewriting for Attribute Anonymization",
author = "Faustini, Pedro and
Tonni, Shakila Mahjabin and
McIver, Annabelle and
Xu, Qiongkai and
Dras, Mark",
journal = "Computational Linguistics",
volume = "51",
number = "4",
month = dec,
year = "2025",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2025.cl-4.3/",
doi = "10.1162/coli.a.17",
pages = "1151--1189",
abstract = "Natural language processing (NLP) models may leak private information in different ways, including membership inference, reconstruction, or attribute inference attacks. Sensitive information may not be explicit in the text, but hidden in underlying writing characteristics. Methods to protect privacy can involve using representations inside models that are demonstrated not to detect sensitive attributes or{---}for instance, in cases where users might be at risk from an untrustworthy model, the sort of scenario of interest here{---}changing the raw text before models can have access to it. The goal is to rewrite text to prevent someone from inferring a sensitive attribute (e.g., the gender of the author, or their location by the writing style) while keeping the text useful for its original intention (e.g., the sentiment of a product review). The few works tackling this have focused on generative techniques. However, these often create extensively different texts from the original ones or face problems such as mode collapse. This article explores a novel adaptation of adversarial attack techniques to manipulate a text to deceive a classifier w.r.t. one task (privacy) while keeping the predictions of another classifier trained for another task (utility) unchanged. We propose IDT, a method that analyses predictions made by auxiliary and interpretable models to identify which tokens are important to change for the privacy task, and which ones should be kept for the utility task. We evaluate different datasets for NLP suitable for different tasks. Automatic and human evaluations show that IDT retains the utility of text, while also outperforming existing methods when deceiving a classifier w.r.t. a privacy task."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="faustini-etal-2025-idt">
<titleInfo>
<title>IDT: Dual-Task Adversarial Rewriting for Attribute Anonymization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="family">Faustini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shakila</namePart>
<namePart type="given">Mahjabin</namePart>
<namePart type="family">Tonni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Annabelle</namePart>
<namePart type="family">McIver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiongkai</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Dras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Natural language processing (NLP) models may leak private information in different ways, including membership inference, reconstruction, or attribute inference attacks. Sensitive information may not be explicit in the text, but hidden in underlying writing characteristics. Methods to protect privacy can involve using representations inside models that are demonstrated not to detect sensitive attributes or—for instance, in cases where users might be at risk from an untrustworthy model, the sort of scenario of interest here—changing the raw text before models can have access to it. The goal is to rewrite text to prevent someone from inferring a sensitive attribute (e.g., the gender of the author, or their location by the writing style) while keeping the text useful for its original intention (e.g., the sentiment of a product review). The few works tackling this have focused on generative techniques. However, these often create extensively different texts from the original ones or face problems such as mode collapse. This article explores a novel adaptation of adversarial attack techniques to manipulate a text to deceive a classifier w.r.t. one task (privacy) while keeping the predictions of another classifier trained for another task (utility) unchanged. We propose IDT, a method that analyses predictions made by auxiliary and interpretable models to identify which tokens are important to change for the privacy task, and which ones should be kept for the utility task. We evaluate different datasets for NLP suitable for different tasks. Automatic and human evaluations show that IDT retains the utility of text, while also outperforming existing methods when deceiving a classifier w.r.t. a privacy task.</abstract>
<identifier type="citekey">faustini-etal-2025-idt</identifier>
<identifier type="doi">10.1162/coli.a.17</identifier>
<location>
<url>https://aclanthology.org/2025.cl-4.3/</url>
</location>
<part>
<date>2025-12</date>
<detail type="volume"><number>51</number></detail>
<detail type="issue"><number>4</number></detail>
<extent unit="page">
<start>1151</start>
<end>1189</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T IDT: Dual-Task Adversarial Rewriting for Attribute Anonymization
%A Faustini, Pedro
%A Tonni, Shakila Mahjabin
%A McIver, Annabelle
%A Xu, Qiongkai
%A Dras, Mark
%J Computational Linguistics
%D 2025
%8 December
%V 51
%N 4
%I MIT Press
%C Cambridge, MA
%F faustini-etal-2025-idt
%X Natural language processing (NLP) models may leak private information in different ways, including membership inference, reconstruction, or attribute inference attacks. Sensitive information may not be explicit in the text, but hidden in underlying writing characteristics. Methods to protect privacy can involve using representations inside models that are demonstrated not to detect sensitive attributes or—for instance, in cases where users might be at risk from an untrustworthy model, the sort of scenario of interest here—changing the raw text before models can have access to it. The goal is to rewrite text to prevent someone from inferring a sensitive attribute (e.g., the gender of the author, or their location by the writing style) while keeping the text useful for its original intention (e.g., the sentiment of a product review). The few works tackling this have focused on generative techniques. However, these often create extensively different texts from the original ones or face problems such as mode collapse. This article explores a novel adaptation of adversarial attack techniques to manipulate a text to deceive a classifier w.r.t. one task (privacy) while keeping the predictions of another classifier trained for another task (utility) unchanged. We propose IDT, a method that analyses predictions made by auxiliary and interpretable models to identify which tokens are important to change for the privacy task, and which ones should be kept for the utility task. We evaluate different datasets for NLP suitable for different tasks. Automatic and human evaluations show that IDT retains the utility of text, while also outperforming existing methods when deceiving a classifier w.r.t. a privacy task.
%R 10.1162/coli.a.17
%U https://aclanthology.org/2025.cl-4.3/
%U https://doi.org/10.1162/coli.a.17
%P 1151-1189
Markdown (Informal)
[IDT: Dual-Task Adversarial Rewriting for Attribute Anonymization](https://aclanthology.org/2025.cl-4.3/) (Faustini et al., CL 2025)
ACL