@inproceedings{hiebel-etal-2023-synthetic,
title = "Can Synthetic Text Help Clinical Named Entity Recognition? A Study of Electronic Health Records in {F}rench",
author = "Hiebel, Nicolas and
Ferret, Olivier and
Fort, Karen and
N{\'e}v{\'e}ol, Aur{\'e}lie",
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.eacl-main.170",
doi = "10.18653/v1/2023.eacl-main.170",
pages = "2320--2338",
abstract = "In sensitive domains, the sharing of corpora is restricted due to confidentiality, copyrights or trade secrets. Automatic text generation can help alleviate these issues by producing synthetic texts that mimic the linguistic properties of real documents while preserving confidentiality. In this study, we assess the usability of synthetic corpus as a substitute training corpus for clinical information extraction. Our goal is to automatically produce a clinical case corpus annotated with clinical entities and to evaluate it for a named entity recognition (NER) task. We use two auto-regressive neural models partially or fully trained on generic French texts and fine-tuned on clinical cases to produce a corpus of synthetic clinical cases. We study variants of the generation process: (i) fine-tuning on annotated vs. plain text (in that case, annotations are obtained a posteriori) and (ii) selection of generated texts based on models parameters and filtering criteria. We then train NER models with the resulting synthetic text and evaluate them on a gold standard clinical corpus. Our experiments suggest that synthetic text is useful for clinical NER.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hiebel-etal-2023-synthetic">
<titleInfo>
<title>Can Synthetic Text Help Clinical Named Entity Recognition? A Study of Electronic Health Records in French</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicolas</namePart>
<namePart type="family">Hiebel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olivier</namePart>
<namePart type="family">Ferret</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karen</namePart>
<namePart type="family">Fort</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aurélie</namePart>
<namePart type="family">Névéol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In sensitive domains, the sharing of corpora is restricted due to confidentiality, copyrights or trade secrets. Automatic text generation can help alleviate these issues by producing synthetic texts that mimic the linguistic properties of real documents while preserving confidentiality. In this study, we assess the usability of synthetic corpus as a substitute training corpus for clinical information extraction. Our goal is to automatically produce a clinical case corpus annotated with clinical entities and to evaluate it for a named entity recognition (NER) task. We use two auto-regressive neural models partially or fully trained on generic French texts and fine-tuned on clinical cases to produce a corpus of synthetic clinical cases. We study variants of the generation process: (i) fine-tuning on annotated vs. plain text (in that case, annotations are obtained a posteriori) and (ii) selection of generated texts based on models parameters and filtering criteria. We then train NER models with the resulting synthetic text and evaluate them on a gold standard clinical corpus. Our experiments suggest that synthetic text is useful for clinical NER.</abstract>
<identifier type="citekey">hiebel-etal-2023-synthetic</identifier>
<identifier type="doi">10.18653/v1/2023.eacl-main.170</identifier>
<location>
<url>https://aclanthology.org/2023.eacl-main.170</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>2320</start>
<end>2338</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can Synthetic Text Help Clinical Named Entity Recognition? A Study of Electronic Health Records in French
%A Hiebel, Nicolas
%A Ferret, Olivier
%A Fort, Karen
%A Névéol, Aurélie
%S Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F hiebel-etal-2023-synthetic
%X In sensitive domains, the sharing of corpora is restricted due to confidentiality, copyrights or trade secrets. Automatic text generation can help alleviate these issues by producing synthetic texts that mimic the linguistic properties of real documents while preserving confidentiality. In this study, we assess the usability of synthetic corpus as a substitute training corpus for clinical information extraction. Our goal is to automatically produce a clinical case corpus annotated with clinical entities and to evaluate it for a named entity recognition (NER) task. We use two auto-regressive neural models partially or fully trained on generic French texts and fine-tuned on clinical cases to produce a corpus of synthetic clinical cases. We study variants of the generation process: (i) fine-tuning on annotated vs. plain text (in that case, annotations are obtained a posteriori) and (ii) selection of generated texts based on models parameters and filtering criteria. We then train NER models with the resulting synthetic text and evaluate them on a gold standard clinical corpus. Our experiments suggest that synthetic text is useful for clinical NER.
%R 10.18653/v1/2023.eacl-main.170
%U https://aclanthology.org/2023.eacl-main.170
%U https://doi.org/10.18653/v1/2023.eacl-main.170
%P 2320-2338
Markdown (Informal)
[Can Synthetic Text Help Clinical Named Entity Recognition? A Study of Electronic Health Records in French](https://aclanthology.org/2023.eacl-main.170) (Hiebel et al., EACL 2023)
ACL