@inproceedings{meoni-etal-2024-generating,
title = "Generating Synthetic Documents with Clinical Keywords: A Privacy-Sensitive Methodology",
author = "Meoni, Simon and
De la Clergerie, {\'E}ric and
Ryffel, Th{\'e}o",
editor = "Demner-Fushman, Dina and
Ananiadou, Sophia and
Thompson, Paul and
Ondov, Brian",
booktitle = "Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.cl4health-1.14",
pages = "115--123",
abstract = "Electronic Health Records store valuable patient-staff interaction data. These notes, often unstructured to save healthcare personnel time, can be challenging to analyze manually. Proprietary online Large Language Models have demonstrated impressive results in analyzing EHR notes. However, Clinical NLP faces unique challenges due to the sensitive and specialized nature of the data. Sending patient information via external APIs poses privacy risks, and hospitals require customized NLP systems to align with their unique practices. To address these challenges, developing customized LLMs using specific training datasets is crucial. To address this, we propose generating synthetic training data using keywords extracted without confidential information. Furthermore, we introduce a reward mechanism that iteratively refines the quality of synthetic documents. This involves scoring synthetic candidates against real clinical reports using a semantic textual similarity score and performing an aligment step to align the model with its best-scored utterances.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="meoni-etal-2024-generating">
<titleInfo>
<title>Generating Synthetic Documents with Clinical Keywords: A Privacy-Sensitive Methodology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Meoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Éric</namePart>
<namePart type="family">De la Clergerie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Théo</namePart>
<namePart type="family">Ryffel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Ondov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Electronic Health Records store valuable patient-staff interaction data. These notes, often unstructured to save healthcare personnel time, can be challenging to analyze manually. Proprietary online Large Language Models have demonstrated impressive results in analyzing EHR notes. However, Clinical NLP faces unique challenges due to the sensitive and specialized nature of the data. Sending patient information via external APIs poses privacy risks, and hospitals require customized NLP systems to align with their unique practices. To address these challenges, developing customized LLMs using specific training datasets is crucial. To address this, we propose generating synthetic training data using keywords extracted without confidential information. Furthermore, we introduce a reward mechanism that iteratively refines the quality of synthetic documents. This involves scoring synthetic candidates against real clinical reports using a semantic textual similarity score and performing an aligment step to align the model with its best-scored utterances.</abstract>
<identifier type="citekey">meoni-etal-2024-generating</identifier>
<location>
<url>https://aclanthology.org/2024.cl4health-1.14</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>115</start>
<end>123</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generating Synthetic Documents with Clinical Keywords: A Privacy-Sensitive Methodology
%A Meoni, Simon
%A De la Clergerie, Éric
%A Ryffel, Théo
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Thompson, Paul
%Y Ondov, Brian
%S Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F meoni-etal-2024-generating
%X Electronic Health Records store valuable patient-staff interaction data. These notes, often unstructured to save healthcare personnel time, can be challenging to analyze manually. Proprietary online Large Language Models have demonstrated impressive results in analyzing EHR notes. However, Clinical NLP faces unique challenges due to the sensitive and specialized nature of the data. Sending patient information via external APIs poses privacy risks, and hospitals require customized NLP systems to align with their unique practices. To address these challenges, developing customized LLMs using specific training datasets is crucial. To address this, we propose generating synthetic training data using keywords extracted without confidential information. Furthermore, we introduce a reward mechanism that iteratively refines the quality of synthetic documents. This involves scoring synthetic candidates against real clinical reports using a semantic textual similarity score and performing an aligment step to align the model with its best-scored utterances.
%U https://aclanthology.org/2024.cl4health-1.14
%P 115-123
Markdown (Informal)
[Generating Synthetic Documents with Clinical Keywords: A Privacy-Sensitive Methodology](https://aclanthology.org/2024.cl4health-1.14) (Meoni et al., CL4Health-WS 2024)
ACL