@inproceedings{maslenkova-etal-2025-building,
title = "Building Trust in Clinical {LLM}s: Bias Analysis and Dataset Transparency",
author = "Maslenkova, Svetlana and
Christophe, Clement and
Pimentel, Marco AF and
Raha, Tathagata and
Salman, Muhammad Umar and
Al Mahrooqi, Ahmed and
Gupta, Avani and
Khan, Shadab and
Rajan, Ronnie and
Kanithi, Praveenkumar",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1174/",
doi = "10.18653/v1/2025.emnlp-main.1174",
pages = "23032--23055",
ISBN = "979-8-89176-332-6",
abstract = "Large language models offer transformative potential for healthcare, yet their responsible and equitable development depends critically on a deeper understanding of how training data characteristics influence model behavior, including the potential for bias. Current practices in dataset curation and bias assessment often lack the necessary transparency, creating an urgent need for comprehensive evaluation frameworks to foster trust and guide improvements. In this study, we present an in-depth analysis of potential downstream biases in clinical language models, with a focus on differential opioid prescription tendencies across diverse demographic groups, such as ethnicity, gender, and age. As part of this investigation, we introduce HC4: Healthcare Comprehensive Commons Corpus, a novel and extensively curated pretraining dataset exceeding 89 billion tokens. Our evaluation leverages both established general benchmarks and a novel, healthcare-specific methodology, offering crucial insights to support fairness and safety in clinical AI applications."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="maslenkova-etal-2025-building">
<titleInfo>
<title>Building Trust in Clinical LLMs: Bias Analysis and Dataset Transparency</title>
</titleInfo>
<name type="personal">
<namePart type="given">Svetlana</namePart>
<namePart type="family">Maslenkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clement</namePart>
<namePart type="family">Christophe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="given">AF</namePart>
<namePart type="family">Pimentel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tathagata</namePart>
<namePart type="family">Raha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="given">Umar</namePart>
<namePart type="family">Salman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Al Mahrooqi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avani</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shadab</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronnie</namePart>
<namePart type="family">Rajan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Praveenkumar</namePart>
<namePart type="family">Kanithi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Large language models offer transformative potential for healthcare, yet their responsible and equitable development depends critically on a deeper understanding of how training data characteristics influence model behavior, including the potential for bias. Current practices in dataset curation and bias assessment often lack the necessary transparency, creating an urgent need for comprehensive evaluation frameworks to foster trust and guide improvements. In this study, we present an in-depth analysis of potential downstream biases in clinical language models, with a focus on differential opioid prescription tendencies across diverse demographic groups, such as ethnicity, gender, and age. As part of this investigation, we introduce HC4: Healthcare Comprehensive Commons Corpus, a novel and extensively curated pretraining dataset exceeding 89 billion tokens. Our evaluation leverages both established general benchmarks and a novel, healthcare-specific methodology, offering crucial insights to support fairness and safety in clinical AI applications.</abstract>
<identifier type="citekey">maslenkova-etal-2025-building</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.1174</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1174/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>23032</start>
<end>23055</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Building Trust in Clinical LLMs: Bias Analysis and Dataset Transparency
%A Maslenkova, Svetlana
%A Christophe, Clement
%A Pimentel, Marco AF
%A Raha, Tathagata
%A Salman, Muhammad Umar
%A Al Mahrooqi, Ahmed
%A Gupta, Avani
%A Khan, Shadab
%A Rajan, Ronnie
%A Kanithi, Praveenkumar
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F maslenkova-etal-2025-building
%X Large language models offer transformative potential for healthcare, yet their responsible and equitable development depends critically on a deeper understanding of how training data characteristics influence model behavior, including the potential for bias. Current practices in dataset curation and bias assessment often lack the necessary transparency, creating an urgent need for comprehensive evaluation frameworks to foster trust and guide improvements. In this study, we present an in-depth analysis of potential downstream biases in clinical language models, with a focus on differential opioid prescription tendencies across diverse demographic groups, such as ethnicity, gender, and age. As part of this investigation, we introduce HC4: Healthcare Comprehensive Commons Corpus, a novel and extensively curated pretraining dataset exceeding 89 billion tokens. Our evaluation leverages both established general benchmarks and a novel, healthcare-specific methodology, offering crucial insights to support fairness and safety in clinical AI applications.
%R 10.18653/v1/2025.emnlp-main.1174
%U https://aclanthology.org/2025.emnlp-main.1174/
%U https://doi.org/10.18653/v1/2025.emnlp-main.1174
%P 23032-23055

Markdown (Informal)
[Building Trust in Clinical LLMs: Bias Analysis and Dataset Transparency](https://aclanthology.org/2025.emnlp-main.1174/) (Maslenkova et al., EMNLP 2025)

ACL
Svetlana Maslenkova, Clement Christophe, Marco AF Pimentel, Tathagata Raha, Muhammad Umar Salman, Ahmed Al Mahrooqi, Avani Gupta, Shadab Khan, Ronnie Rajan, and Praveenkumar Kanithi. 2025. Building Trust in Clinical LLMs: Bias Analysis and Dataset Transparency. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 23032–23055, Suzhou, China. Association for Computational Linguistics.