@inproceedings{morris-etal-2022-unsupervised,
title = "Unsupervised Text Deidentification",
author = "Morris, John and
Chiu, Justin and
Zabih, Ramin and
Rush, Alexander",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.352",
doi = "10.18653/v1/2022.findings-emnlp.352",
pages = "4777--4788",
abstract = "Deidentification seeks to anonymize textual data prior to distribution. Automatic deidentification primarily uses supervised named entity recognition from human-labeled data points. We propose an unsupervised deidentification method that masks words that leak personally-identifying information. The approach utilizes a specially trained reidentification model to identify individuals from redacted personal documents. Motivated by K-anonymity based privacy, we generate redactions that ensure a minimum reidentification rank for the correct profile of the document. To evaluate this approach, we consider the task of deidentifying Wikipedia Biographies, and evaluate using an adversarial reidentification metric. Compared to a set of unsupervised baselines, our approach deidentifies documents more completely while removing fewer words. Qualitatively, we see that the approach eliminates many identifying aspects that would fall outside of the common named entity based approach.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="morris-etal-2022-unsupervised">
<titleInfo>
<title>Unsupervised Text Deidentification</title>
</titleInfo>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Morris</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Justin</namePart>
<namePart type="family">Chiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramin</namePart>
<namePart type="family">Zabih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Rush</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Deidentification seeks to anonymize textual data prior to distribution. Automatic deidentification primarily uses supervised named entity recognition from human-labeled data points. We propose an unsupervised deidentification method that masks words that leak personally-identifying information. The approach utilizes a specially trained reidentification model to identify individuals from redacted personal documents. Motivated by K-anonymity based privacy, we generate redactions that ensure a minimum reidentification rank for the correct profile of the document. To evaluate this approach, we consider the task of deidentifying Wikipedia Biographies, and evaluate using an adversarial reidentification metric. Compared to a set of unsupervised baselines, our approach deidentifies documents more completely while removing fewer words. Qualitatively, we see that the approach eliminates many identifying aspects that would fall outside of the common named entity based approach.</abstract>
<identifier type="citekey">morris-etal-2022-unsupervised</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.352</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.352</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>4777</start>
<end>4788</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unsupervised Text Deidentification
%A Morris, John
%A Chiu, Justin
%A Zabih, Ramin
%A Rush, Alexander
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F morris-etal-2022-unsupervised
%X Deidentification seeks to anonymize textual data prior to distribution. Automatic deidentification primarily uses supervised named entity recognition from human-labeled data points. We propose an unsupervised deidentification method that masks words that leak personally-identifying information. The approach utilizes a specially trained reidentification model to identify individuals from redacted personal documents. Motivated by K-anonymity based privacy, we generate redactions that ensure a minimum reidentification rank for the correct profile of the document. To evaluate this approach, we consider the task of deidentifying Wikipedia Biographies, and evaluate using an adversarial reidentification metric. Compared to a set of unsupervised baselines, our approach deidentifies documents more completely while removing fewer words. Qualitatively, we see that the approach eliminates many identifying aspects that would fall outside of the common named entity based approach.
%R 10.18653/v1/2022.findings-emnlp.352
%U https://aclanthology.org/2022.findings-emnlp.352
%U https://doi.org/10.18653/v1/2022.findings-emnlp.352
%P 4777-4788
Markdown (Informal)
[Unsupervised Text Deidentification](https://aclanthology.org/2022.findings-emnlp.352) (Morris et al., Findings 2022)
ACL
- John Morris, Justin Chiu, Ramin Zabih, and Alexander Rush. 2022. Unsupervised Text Deidentification. In Findings of the Association for Computational Linguistics: EMNLP 2022, pages 4777–4788, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.