@inproceedings{carandang-etal-2025-llms,
title = "Are {LLM}s reliable? An exploration of the reliability of large language models in clinical note generation",
author = "Carandang, Kristine Ann M. and
Arana, Jasper Meynard and
Casin, Ethan Robert and
Monterola, Christopher and
Tan, Daniel Stanley and
Valenzuela, Jesus Felix B. and
Alis, Christian",
editor = "Rehm, Georg and
Li, Yunyao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-industry.99/",
doi = "10.18653/v1/2025.acl-industry.99",
pages = "1413--1422",
ISBN = "979-8-89176-288-6",
abstract = "Due to the legal and ethical responsibilities of healthcare providers (HCPs) for accurate documentation and protection of patient data privacy, the natural variability in the responses of large language models (LLMs) presents challenges for incorporating clinical note generation (CNG) systems, driven by LLMs, into real-world clinical processes. The complexity is further amplified by the detailed nature of texts in CNG. To enhance the confidence of HCPs in tools powered by LLMs, this study evaluates the reliability of 12 open-weight and proprietary LLMs from Anthropic, Meta, Mistral, and OpenAI in CNG in terms of their ability to generate notes that are string equivalent (consistency rate), have the same meaning (semantic consistency) and are correct (semantic similarity), across several iterations using the same prompt. The results show that (1) LLMs from all model families are stable, such that their responses are semantically consistent despite being written in various ways, and (2) most of the LLMs generated notes close to the corresponding notes made by experts. Overall, Meta{'}s Llama 70B was the most reliable, followed by Mistral{'}s Small model. With these findings, we recommend the local deployment of these relatively smaller open-weight models for CNG to ensure compliance with data privacy regulations, as well as to improve the efficiency of HCPs in clinical documentation."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="carandang-etal-2025-llms">
<titleInfo>
<title>Are LLMs reliable? An exploration of the reliability of large language models in clinical note generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kristine</namePart>
<namePart type="given">Ann</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Carandang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jasper</namePart>
<namePart type="given">Meynard</namePart>
<namePart type="family">Arana</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ethan</namePart>
<namePart type="given">Robert</namePart>
<namePart type="family">Casin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Monterola</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="given">Stanley</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jesus</namePart>
<namePart type="given">Felix</namePart>
<namePart type="given">B</namePart>
<namePart type="family">Valenzuela</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Alis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Rehm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-288-6</identifier>
</relatedItem>
<abstract>Due to the legal and ethical responsibilities of healthcare providers (HCPs) for accurate documentation and protection of patient data privacy, the natural variability in the responses of large language models (LLMs) presents challenges for incorporating clinical note generation (CNG) systems, driven by LLMs, into real-world clinical processes. The complexity is further amplified by the detailed nature of texts in CNG. To enhance the confidence of HCPs in tools powered by LLMs, this study evaluates the reliability of 12 open-weight and proprietary LLMs from Anthropic, Meta, Mistral, and OpenAI in CNG in terms of their ability to generate notes that are string equivalent (consistency rate), have the same meaning (semantic consistency) and are correct (semantic similarity), across several iterations using the same prompt. The results show that (1) LLMs from all model families are stable, such that their responses are semantically consistent despite being written in various ways, and (2) most of the LLMs generated notes close to the corresponding notes made by experts. Overall, Meta’s Llama 70B was the most reliable, followed by Mistral’s Small model. With these findings, we recommend the local deployment of these relatively smaller open-weight models for CNG to ensure compliance with data privacy regulations, as well as to improve the efficiency of HCPs in clinical documentation.</abstract>
<identifier type="citekey">carandang-etal-2025-llms</identifier>
<identifier type="doi">10.18653/v1/2025.acl-industry.99</identifier>
<location>
<url>https://aclanthology.org/2025.acl-industry.99/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1413</start>
<end>1422</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Are LLMs reliable? An exploration of the reliability of large language models in clinical note generation
%A Carandang, Kristine Ann M.
%A Arana, Jasper Meynard
%A Casin, Ethan Robert
%A Monterola, Christopher
%A Tan, Daniel Stanley
%A Valenzuela, Jesus Felix B.
%A Alis, Christian
%Y Rehm, Georg
%Y Li, Yunyao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-288-6
%F carandang-etal-2025-llms
%X Due to the legal and ethical responsibilities of healthcare providers (HCPs) for accurate documentation and protection of patient data privacy, the natural variability in the responses of large language models (LLMs) presents challenges for incorporating clinical note generation (CNG) systems, driven by LLMs, into real-world clinical processes. The complexity is further amplified by the detailed nature of texts in CNG. To enhance the confidence of HCPs in tools powered by LLMs, this study evaluates the reliability of 12 open-weight and proprietary LLMs from Anthropic, Meta, Mistral, and OpenAI in CNG in terms of their ability to generate notes that are string equivalent (consistency rate), have the same meaning (semantic consistency) and are correct (semantic similarity), across several iterations using the same prompt. The results show that (1) LLMs from all model families are stable, such that their responses are semantically consistent despite being written in various ways, and (2) most of the LLMs generated notes close to the corresponding notes made by experts. Overall, Meta’s Llama 70B was the most reliable, followed by Mistral’s Small model. With these findings, we recommend the local deployment of these relatively smaller open-weight models for CNG to ensure compliance with data privacy regulations, as well as to improve the efficiency of HCPs in clinical documentation.
%R 10.18653/v1/2025.acl-industry.99
%U https://aclanthology.org/2025.acl-industry.99/
%U https://doi.org/10.18653/v1/2025.acl-industry.99
%P 1413-1422
Markdown (Informal)
[Are LLMs reliable? An exploration of the reliability of large language models in clinical note generation](https://aclanthology.org/2025.acl-industry.99/) (Carandang et al., ACL 2025)
ACL