@inproceedings{chen-etal-2025-uncertainty,
title = "Uncertainty Quantification for Clinical Outcome Predictions with (Large) Language Models",
author = "Chen, Zizhang and
Li, Peizhao and
Dong, Xiaomeng and
Hong, Pengyu",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.419/",
doi = "10.18653/v1/2025.findings-naacl.419",
pages = "7512--7523",
ISBN = "979-8-89176-195-7",
abstract = "To facilitate healthcare delivery, language models (LMs) have significant potential for clinical prediction tasks using electronic health records (EHRs). However, in these high-stakes applications, unreliable decisions can result in significant costs due to compromised patient safety and ethical concerns, thus increasing the need for good uncertainty modelling of automated clinical predictions. To address this, we consider uncertainty quantification of LMs for EHR tasks in both white-box and black-box settings. We first quantify uncertainty in white-box models, where we have access to model parameters and output logits. We show that an effective reduction of model uncertainty can be achieved by using the proposed multi-tasking and ensemble methods in EHRs. Continuing with this idea, we extend our approach to black-box settings, including popular proprietary LMs such as GPT-4. We validate our framework using longitudinal clinical data from over 6,000 patients across ten clinical prediction tasks. Results show that ensembling methods and multi-task prediction prompts reduce uncertainty across different scenarios. These findings increase model transparency in white-box and black-box settings, thereby advancing reliable AI healthcare."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2025-uncertainty">
<titleInfo>
<title>Uncertainty Quantification for Clinical Outcome Predictions with (Large) Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zizhang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peizhao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaomeng</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengyu</namePart>
<namePart type="family">Hong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>To facilitate healthcare delivery, language models (LMs) have significant potential for clinical prediction tasks using electronic health records (EHRs). However, in these high-stakes applications, unreliable decisions can result in significant costs due to compromised patient safety and ethical concerns, thus increasing the need for good uncertainty modelling of automated clinical predictions. To address this, we consider uncertainty quantification of LMs for EHR tasks in both white-box and black-box settings. We first quantify uncertainty in white-box models, where we have access to model parameters and output logits. We show that an effective reduction of model uncertainty can be achieved by using the proposed multi-tasking and ensemble methods in EHRs. Continuing with this idea, we extend our approach to black-box settings, including popular proprietary LMs such as GPT-4. We validate our framework using longitudinal clinical data from over 6,000 patients across ten clinical prediction tasks. Results show that ensembling methods and multi-task prediction prompts reduce uncertainty across different scenarios. These findings increase model transparency in white-box and black-box settings, thereby advancing reliable AI healthcare.</abstract>
<identifier type="citekey">chen-etal-2025-uncertainty</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.419</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.419/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>7512</start>
<end>7523</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Uncertainty Quantification for Clinical Outcome Predictions with (Large) Language Models
%A Chen, Zizhang
%A Li, Peizhao
%A Dong, Xiaomeng
%A Hong, Pengyu
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F chen-etal-2025-uncertainty
%X To facilitate healthcare delivery, language models (LMs) have significant potential for clinical prediction tasks using electronic health records (EHRs). However, in these high-stakes applications, unreliable decisions can result in significant costs due to compromised patient safety and ethical concerns, thus increasing the need for good uncertainty modelling of automated clinical predictions. To address this, we consider uncertainty quantification of LMs for EHR tasks in both white-box and black-box settings. We first quantify uncertainty in white-box models, where we have access to model parameters and output logits. We show that an effective reduction of model uncertainty can be achieved by using the proposed multi-tasking and ensemble methods in EHRs. Continuing with this idea, we extend our approach to black-box settings, including popular proprietary LMs such as GPT-4. We validate our framework using longitudinal clinical data from over 6,000 patients across ten clinical prediction tasks. Results show that ensembling methods and multi-task prediction prompts reduce uncertainty across different scenarios. These findings increase model transparency in white-box and black-box settings, thereby advancing reliable AI healthcare.
%R 10.18653/v1/2025.findings-naacl.419
%U https://aclanthology.org/2025.findings-naacl.419/
%U https://doi.org/10.18653/v1/2025.findings-naacl.419
%P 7512-7523
Markdown (Informal)
[Uncertainty Quantification for Clinical Outcome Predictions with (Large) Language Models](https://aclanthology.org/2025.findings-naacl.419/) (Chen et al., Findings 2025)
ACL