@inproceedings{berg-etal-2020-impact,
title = "The Impact of De-identification on Downstream Named Entity Recognition in Clinical Text",
author = "Berg, Hanna and
Henriksson, Aron and
Dalianis, Hercules",
editor = "Holderness, Eben and
Jimeno Yepes, Antonio and
Lavelli, Alberto and
Minard, Anne-Lyse and
Pustejovsky, James and
Rinaldi, Fabio",
booktitle = "Proceedings of the 11th International Workshop on Health Text Mining and Information Analysis",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.louhi-1.1",
doi = "10.18653/v1/2020.louhi-1.1",
pages = "1--11",
abstract = "The impact of de-identification on data quality and, in particular, utility for developing models for downstream tasks has been more thoroughly studied for structured data than for unstructured text. While previous studies indicate that text de-identification has a limited impact on models for downstream tasks, it remains unclear what the impact is with various levels and forms of de-identification, in particular concerning the trade-off between precision and recall. In this paper, the impact of de-identification is studied on downstream named entity recognition in Swedish clinical text. The results indicate that de-identification models with moderate to high precision lead to similar downstream performance, while low precision has a substantial negative impact. Furthermore, different strategies for concealing sensitive information affect performance to different degrees, ranging from pseudonymisation having a low impact to the removal of entire sentences with sensitive information having a high impact. This study indicates that it is possible to increase the recall of models for identifying sensitive information without negatively affecting the use of de-identified text data for training models for clinical named entity recognition; however, there is ultimately a trade-off between the level of de-identification and the subsequent utility of the data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="berg-etal-2020-impact">
<titleInfo>
<title>The Impact of De-identification on Downstream Named Entity Recognition in Clinical Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hanna</namePart>
<namePart type="family">Berg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aron</namePart>
<namePart type="family">Henriksson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hercules</namePart>
<namePart type="family">Dalianis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 11th International Workshop on Health Text Mining and Information Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eben</namePart>
<namePart type="family">Holderness</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Jimeno Yepes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Lavelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anne-Lyse</namePart>
<namePart type="family">Minard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Rinaldi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The impact of de-identification on data quality and, in particular, utility for developing models for downstream tasks has been more thoroughly studied for structured data than for unstructured text. While previous studies indicate that text de-identification has a limited impact on models for downstream tasks, it remains unclear what the impact is with various levels and forms of de-identification, in particular concerning the trade-off between precision and recall. In this paper, the impact of de-identification is studied on downstream named entity recognition in Swedish clinical text. The results indicate that de-identification models with moderate to high precision lead to similar downstream performance, while low precision has a substantial negative impact. Furthermore, different strategies for concealing sensitive information affect performance to different degrees, ranging from pseudonymisation having a low impact to the removal of entire sentences with sensitive information having a high impact. This study indicates that it is possible to increase the recall of models for identifying sensitive information without negatively affecting the use of de-identified text data for training models for clinical named entity recognition; however, there is ultimately a trade-off between the level of de-identification and the subsequent utility of the data.</abstract>
<identifier type="citekey">berg-etal-2020-impact</identifier>
<identifier type="doi">10.18653/v1/2020.louhi-1.1</identifier>
<location>
<url>https://aclanthology.org/2020.louhi-1.1</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>1</start>
<end>11</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Impact of De-identification on Downstream Named Entity Recognition in Clinical Text
%A Berg, Hanna
%A Henriksson, Aron
%A Dalianis, Hercules
%Y Holderness, Eben
%Y Jimeno Yepes, Antonio
%Y Lavelli, Alberto
%Y Minard, Anne-Lyse
%Y Pustejovsky, James
%Y Rinaldi, Fabio
%S Proceedings of the 11th International Workshop on Health Text Mining and Information Analysis
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F berg-etal-2020-impact
%X The impact of de-identification on data quality and, in particular, utility for developing models for downstream tasks has been more thoroughly studied for structured data than for unstructured text. While previous studies indicate that text de-identification has a limited impact on models for downstream tasks, it remains unclear what the impact is with various levels and forms of de-identification, in particular concerning the trade-off between precision and recall. In this paper, the impact of de-identification is studied on downstream named entity recognition in Swedish clinical text. The results indicate that de-identification models with moderate to high precision lead to similar downstream performance, while low precision has a substantial negative impact. Furthermore, different strategies for concealing sensitive information affect performance to different degrees, ranging from pseudonymisation having a low impact to the removal of entire sentences with sensitive information having a high impact. This study indicates that it is possible to increase the recall of models for identifying sensitive information without negatively affecting the use of de-identified text data for training models for clinical named entity recognition; however, there is ultimately a trade-off between the level of de-identification and the subsequent utility of the data.
%R 10.18653/v1/2020.louhi-1.1
%U https://aclanthology.org/2020.louhi-1.1
%U https://doi.org/10.18653/v1/2020.louhi-1.1
%P 1-11
Markdown (Informal)
[The Impact of De-identification on Downstream Named Entity Recognition in Clinical Text](https://aclanthology.org/2020.louhi-1.1) (Berg et al., Louhi 2020)
ACL