% Conference paper record (EMNLP 2024 proceedings). Values are brace-delimited;
% the acronym in the title is brace-protected so styles keep its capitals.
@inproceedings{larson-etal-2024-de,
  author    = {Larson, Stefan
               and Lima, Nicole Cornehl
               and Diaz, Santiago Pedroza
               and Joshi, Amogh Manoj
               and Betala, Siddharth
               and Suleiman, Jamiu Tunde
               and Mathur, Yash
               and Prajapati, Kaushal Kumar
               and Alakraa, Ramla
               and Shen, Junjie
               and Okotore, Temi
               and Leach, Kevin},
  title     = {De-Identification of Sensitive Personal Data in Datasets Derived from {IIT}-{CDIP}},
  editor    = {Al-Onaizan, Yaser
               and Bansal, Mohit
               and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {21494--21505},
  doi       = {10.18653/v1/2024.emnlp-main.1198},
  url       = {https://aclanthology.org/2024.emnlp-main.1198},
  abstract  = {The IIT-CDIP document collection is the source of several widely used and publicly accessible document understanding datasets. In this paper, manual inspection of 5 datasets derived from IIT-CDIP uncovers the presence of thousands of instances of sensitive personal data, including US Social Security Numbers (SSNs), birth places and dates, and home addresses of individuals. The presence of such sensitive personal data in commonly-used and publicly available datasets is startling and has ethical and potentially legal implications; we believe such sensitive data ought to be removed from the internet. Thus, in this paper, we develop a modular data de-identification pipeline that replaces sensitive data with synthetic, but realistic, data. Via experiments, we demonstrate that this de-identification method preserves the utility of the de-identified documents so that they can continue be used in various document understanding applications. We will release redacted versions of these datasets publicly.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for citekey larson-etal-2024-de: one paper plus its host proceedings. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="larson-etal-2024-de">
    <titleInfo>
      <title>De-Identification of Sensitive Personal Data in Datasets Derived from IIT-CDIP</title>
    </titleInfo>

    <!-- Authors, in the order they are listed on the paper. -->
    <name type="personal">
      <namePart type="given">Stefan</namePart>
      <namePart type="family">Larson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nicole</namePart>
      <namePart type="given">Cornehl</namePart>
      <namePart type="family">Lima</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Santiago</namePart>
      <namePart type="given">Pedroza</namePart>
      <namePart type="family">Diaz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Amogh</namePart>
      <namePart type="given">Manoj</namePart>
      <namePart type="family">Joshi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Siddharth</namePart>
      <namePart type="family">Betala</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jamiu</namePart>
      <namePart type="given">Tunde</namePart>
      <namePart type="family">Suleiman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yash</namePart>
      <namePart type="family">Mathur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kaushal</namePart>
      <namePart type="given">Kumar</namePart>
      <namePart type="family">Prajapati</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ramla</namePart>
      <namePart type="family">Alakraa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Junjie</namePart>
      <namePart type="family">Shen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Temi</namePart>
      <namePart type="family">Okotore</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kevin</namePart>
      <namePart type="family">Leach</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>The IIT-CDIP document collection is the source of several widely used and publicly accessible document understanding datasets. In this paper, manual inspection of 5 datasets derived from IIT-CDIP uncovers the presence of thousands of instances of sensitive personal data, including US Social Security Numbers (SSNs), birth places and dates, and home addresses of individuals. The presence of such sensitive personal data in commonly-used and publicly available datasets is startling and has ethical and potentially legal implications; we believe such sensitive data ought to be removed from the internet. Thus, in this paper, we develop a modular data de-identification pipeline that replaces sensitive data with synthetic, but realistic, data. Via experiments, we demonstrate that this de-identification method preserves the utility of the de-identified documents so that they can continue be used in various document understanding applications. We will release redacted versions of these datasets publicly.</abstract>

    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">larson-etal-2024-de</identifier>
    <identifier type="doi">10.18653/v1/2024.emnlp-main.1198</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-main.1198</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>21494</start>
        <end>21505</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T De-Identification of Sensitive Personal Data in Datasets Derived from IIT-CDIP
%A Larson, Stefan
%A Lima, Nicole Cornehl
%A Diaz, Santiago Pedroza
%A Joshi, Amogh Manoj
%A Betala, Siddharth
%A Suleiman, Jamiu Tunde
%A Mathur, Yash
%A Prajapati, Kaushal Kumar
%A Alakraa, Ramla
%A Shen, Junjie
%A Okotore, Temi
%A Leach, Kevin
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F larson-etal-2024-de
%X The IIT-CDIP document collection is the source of several widely used and publicly accessible document understanding datasets. In this paper, manual inspection of 5 datasets derived from IIT-CDIP uncovers the presence of thousands of instances of sensitive personal data, including US Social Security Numbers (SSNs), birth places and dates, and home addresses of individuals. The presence of such sensitive personal data in commonly-used and publicly available datasets is startling and has ethical and potentially legal implications; we believe such sensitive data ought to be removed from the internet. Thus, in this paper, we develop a modular data de-identification pipeline that replaces sensitive data with synthetic, but realistic, data. Via experiments, we demonstrate that this de-identification method preserves the utility of the de-identified documents so that they can continue be used in various document understanding applications. We will release redacted versions of these datasets publicly.
%R 10.18653/v1/2024.emnlp-main.1198
%U https://aclanthology.org/2024.emnlp-main.1198
%U https://doi.org/10.18653/v1/2024.emnlp-main.1198
%P 21494-21505
Markdown (Informal)
[De-Identification of Sensitive Personal Data in Datasets Derived from IIT-CDIP](https://aclanthology.org/2024.emnlp-main.1198) (Larson et al., EMNLP 2024)
ACL
- Stefan Larson, Nicole Cornehl Lima, Santiago Pedroza Diaz, Amogh Manoj Joshi, Siddharth Betala, Jamiu Tunde Suleiman, Yash Mathur, Kaushal Kumar Prajapati, Ramla Alakraa, Junjie Shen, Temi Okotore, and Kevin Leach. 2024. De-Identification of Sensitive Personal Data in Datasets Derived from IIT-CDIP. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 21494–21505, Miami, Florida, USA. Association for Computational Linguistics.