@inproceedings{kiefer-etal-2025-instruction,
title = "Instruction-Tuning {LL}a{MA} for Synthetic Medical Note Generation in {S}wedish and {E}nglish",
author = "Kiefer, Lotta and
Alabi, Jesujoba and
Vakili, Thomas and
Dalianis, Hercules and
Klakow, Dietrich",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.65/",
pages = "557--566",
    abstract = "The increasing capabilities of large language models (LLMs) have unlocked transformative potential for medical applications, but privacy constraints limit access to high-quality training data from electronic health records (EHRs). In response, we propose a framework to generate synthetic EHRs by instruction-tuning an LLM using descriptions of diagnosis codes. We show that this framework overcomes problems of prior approaches, such as diversity reduction and medical incoherence, while maintaining strong privacy protections. Utility was measured by training models to predict diagnosis codes for EHRs. Real data still has higher utility, but synthetic data approaches real data results with increasing dataset size. The differences in utility were most likely due to noise in the synthetic data. A user study involving medical professionals confirmed no significant loss in readability or medical coherence compared to the real EHRs, even though inter-annotator agreement is low. These findings establish synthetic EHRs as a viable alternative for privacy-preserving and scalable clinical NLP applications. We release our code on GitHub."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kiefer-etal-2025-instruction">
<titleInfo>
<title>Instruction-Tuning LLaMA for Synthetic Medical Note Generation in Swedish and English</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lotta</namePart>
<namePart type="family">Kiefer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jesujoba</namePart>
<namePart type="family">Alabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Vakili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hercules</namePart>
<namePart type="family">Dalianis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dietrich</namePart>
<namePart type="family">Klakow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
  <abstract>The increasing capabilities of large language models (LLMs) have unlocked transformative potential for medical applications, but privacy constraints limit access to high-quality training data from electronic health records (EHRs). In response, we propose a framework to generate synthetic EHRs by instruction-tuning an LLM using descriptions of diagnosis codes. We show that this framework overcomes problems of prior approaches, such as diversity reduction and medical incoherence, while maintaining strong privacy protections. Utility was measured by training models to predict diagnosis codes for EHRs. Real data still has higher utility, but synthetic data approaches real data results with increasing dataset size. The differences in utility were most likely due to noise in the synthetic data. A user study involving medical professionals confirmed no significant loss in readability or medical coherence compared to the real EHRs, even though inter-annotator agreement is low. These findings establish synthetic EHRs as a viable alternative for privacy-preserving and scalable clinical NLP applications. We release our code on GitHub.</abstract>
<identifier type="citekey">kiefer-etal-2025-instruction</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.65/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>557</start>
<end>566</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Instruction-Tuning LLaMA for Synthetic Medical Note Generation in Swedish and English
%A Kiefer, Lotta
%A Alabi, Jesujoba
%A Vakili, Thomas
%A Dalianis, Hercules
%A Klakow, Dietrich
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F kiefer-etal-2025-instruction
%X The increasing capabilities of large language models (LLMs) have unlocked transformative potential for medical applications, but privacy constraints limit access to high-quality training data from electronic health records (EHRs). In response, we propose a framework to generate synthetic EHRs by instruction-tuning an LLM using descriptions of diagnosis codes. We show that this framework overcomes problems of prior approaches, such as diversity reduction and medical incoherence, while maintaining strong privacy protections. Utility was measured by training models to predict diagnosis codes for EHRs. Real data still has higher utility, but synthetic data approaches real data results with increasing dataset size. The differences in utility were most likely due to noise in the synthetic data. A user study involving medical professionals confirmed no significant loss in readability or medical coherence compared to the real EHRs, even though inter-annotator agreement is low. These findings establish synthetic EHRs as a viable alternative for privacy-preserving and scalable clinical NLP applications. We release our code on GitHub.
%U https://aclanthology.org/2025.ranlp-1.65/
%P 557-566
Markdown (Informal)
[Instruction-Tuning LLaMA for Synthetic Medical Note Generation in Swedish and English](https://aclanthology.org/2025.ranlp-1.65/) (Kiefer et al., RANLP 2025)
ACL