@inproceedings{kim-etal-2024-generalizing,
title = "Generalizing Clinical De-identification Models by Privacy-safe Data Augmentation using {GPT}-4",
author = "Kim, Woojin and
Hahm, Sungeun and
Lee, Jaejin",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1181",
pages = "21204--21218",
abstract = "De-identification (de-ID) refers to removing the association between a set of identifying data and the data subject. In clinical data management, the de-ID of Protected Health Information (PHI) is critical for patient confidentiality. However, state-of-the-art de-ID models show poor generalization on a new dataset. This is due to the difficulty of retaining training corpora. Additionally, labeling standards and the formats of patient records vary across different institutions. Our study addresses these issues by exploiting GPT-4 for data augmentation through one-shot and zero-shot prompts. Our approach effectively circumvents the problem of PHI leakage, ensuring privacy by redacting PHI before processing. To evaluate the effectiveness of our proposal, we conduct cross-dataset testing. The experimental result demonstrates significant improvements across three types of F1 scores.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-etal-2024-generalizing">
<titleInfo>
<title>Generalizing Clinical De-identification Models by Privacy-safe Data Augmentation using GPT-4</title>
</titleInfo>
<name type="personal">
<namePart type="given">Woojin</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sungeun</namePart>
<namePart type="family">Hahm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaejin</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>De-identification (de-ID) refers to removing the association between a set of identifying data and the data subject. In clinical data management, the de-ID of Protected Health Information (PHI) is critical for patient confidentiality. However, state-of-the-art de-ID models show poor generalization on a new dataset. This is due to the difficulty of retaining training corpora. Additionally, labeling standards and the formats of patient records vary across different institutions. Our study addresses these issues by exploiting GPT-4 for data augmentation through one-shot and zero-shot prompts. Our approach effectively circumvents the problem of PHI leakage, ensuring privacy by redacting PHI before processing. To evaluate the effectiveness of our proposal, we conduct cross-dataset testing. The experimental result demonstrates significant improvements across three types of F1 scores.</abstract>
<identifier type="citekey">kim-etal-2024-generalizing</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1181</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>21204</start>
<end>21218</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generalizing Clinical De-identification Models by Privacy-safe Data Augmentation using GPT-4
%A Kim, Woojin
%A Hahm, Sungeun
%A Lee, Jaejin
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F kim-etal-2024-generalizing
%X De-identification (de-ID) refers to removing the association between a set of identifying data and the data subject. In clinical data management, the de-ID of Protected Health Information (PHI) is critical for patient confidentiality. However, state-of-the-art de-ID models show poor generalization on a new dataset. This is due to the difficulty of retaining training corpora. Additionally, labeling standards and the formats of patient records vary across different institutions. Our study addresses these issues by exploiting GPT-4 for data augmentation through one-shot and zero-shot prompts. Our approach effectively circumvents the problem of PHI leakage, ensuring privacy by redacting PHI before processing. To evaluate the effectiveness of our proposal, we conduct cross-dataset testing. The experimental result demonstrates significant improvements across three types of F1 scores.
%U https://aclanthology.org/2024.emnlp-main.1181
%P 21204-21218
Markdown (Informal)
[Generalizing Clinical De-identification Models by Privacy-safe Data Augmentation using GPT-4](https://aclanthology.org/2024.emnlp-main.1181) (Kim et al., EMNLP 2024)
ACL