@inproceedings{ghosh-etal-2025-medequalqa,
title = "{MEDEQUALQA}: Evaluating Biases in {LLM}s with Counterfactual Reasoning",
author = "Ghosh, Rajarshi and
Gupta, Abhay and
McBride, Hudson and
Vaidya, Anurag Jayant and
Mahmood, Faisal",
editor = "Zhao, Wei and
D{'}Souza, Jennifer and
Eger, Steffen and
Lauscher, Anne and
Hou, Yufang and
Sadat Moosavi, Nafise and
Miller, Tristan and
Lin, Chenghua",
booktitle = "Proceedings of The First Workshop on Human{--}LLM Collaboration for Ethical and Responsible Science Production (SciProdLLM)",
month = dec,
year = "2025",
address = "Mumbai, India (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sciprodllm-1.4/",
pages = "25--37",
ISBN = "979-8-89176-307-4",
abstract = "Large language models (LLMs) are increasingly deployed in clinical decision support, yet subtle demographic cues can influence their reasoning. Prior work has documented disparities in outputs across patient groups, but little is known about how internal reasoning shifts under controlled demographic changes. We introduce MEDEQUALQA, a counterfactual benchmark that perturbs only patient pronouns (he/him, she/her, they/them) while holding critical symptoms and conditions (CSCs) constant. Each vignette is expanded into single-CSC ablations, producing three parallel datasets of approximately 23k items each (69k total). We evaluate a frontier LLM and compute Semantic Textual Similarity (STS) between reasoning traces to measure stability across pronoun variants. Our results show overall high similarity (mean STS {\ensuremath{>}} 0.80) but reveal consistent localized divergences in cited risk factors, guideline anchors, and differential ordering{---}even when final diagnoses remain unchanged. Error analysis identifies specific cases where reasoning shifts occur, highlighting clinically relevant bias loci that may cascade into inequitable care. MEDEQUALQA provides a controlled diagnostic setting for auditing reasoning stability in medical AI."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ghosh-etal-2025-medequalqa">
<titleInfo>
<title>MEDEQUALQA: Evaluating Biases in LLMs with Counterfactual Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rajarshi</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhay</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hudson</namePart>
<namePart type="family">McBride</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anurag</namePart>
<namePart type="given">Jayant</namePart>
<namePart type="family">Vaidya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Faisal</namePart>
<namePart type="family">Mahmood</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The First Workshop on Human–LLM Collaboration for Ethical and Responsible Science Production (SciProdLLM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jennifer</namePart>
<namePart type="family">D’Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Eger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anne</namePart>
<namePart type="family">Lauscher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yufang</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nafise</namePart>
<namePart type="family">Sadat Moosavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tristan</namePart>
<namePart type="family">Miller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenghua</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-307-4</identifier>
</relatedItem>
<abstract>Large language models (LLMs) are increasingly deployed in clinical decision support, yet subtle demographic cues can influence their reasoning. Prior work has documented disparities in outputs across patient groups, but little is known about how internal reasoning shifts under controlled demographic changes. We introduce MEDEQUALQA, a counterfactual benchmark that perturbs only patient pronouns (he/him, she/her, they/them) while holding critical symptoms and conditions (CSCs) constant. Each vignette is expanded into single-CSC ablations, producing three parallel datasets of approximately 23k items each (69k total). We evaluate a frontier LLM and compute Semantic Textual Similarity (STS) between reasoning traces to measure stability across pronoun variants. Our results show overall high similarity (mean STS > 0.80) but reveal consistent localized divergences in cited risk factors, guideline anchors, and differential ordering—even when final diagnoses remain unchanged. Error analysis identifies specific cases where reasoning shifts occur, highlighting clinically relevant bias loci that may cascade into inequitable care. MEDEQUALQA provides a controlled diagnostic setting for auditing reasoning stability in medical AI.</abstract>
<identifier type="citekey">ghosh-etal-2025-medequalqa</identifier>
<location>
<url>https://aclanthology.org/2025.sciprodllm-1.4/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>25</start>
<end>37</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MEDEQUALQA: Evaluating Biases in LLMs with Counterfactual Reasoning
%A Ghosh, Rajarshi
%A Gupta, Abhay
%A McBride, Hudson
%A Vaidya, Anurag Jayant
%A Mahmood, Faisal
%Y Zhao, Wei
%Y D’Souza, Jennifer
%Y Eger, Steffen
%Y Lauscher, Anne
%Y Hou, Yufang
%Y Sadat Moosavi, Nafise
%Y Miller, Tristan
%Y Lin, Chenghua
%S Proceedings of The First Workshop on Human–LLM Collaboration for Ethical and Responsible Science Production (SciProdLLM)
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India (Hybrid)
%@ 979-8-89176-307-4
%F ghosh-etal-2025-medequalqa
%X Large language models (LLMs) are increasingly deployed in clinical decision support, yet subtle demographic cues can influence their reasoning. Prior work has documented disparities in outputs across patient groups, but little is known about how internal reasoning shifts under controlled demographic changes. We introduce MEDEQUALQA, a counterfactual benchmark that perturbs only patient pronouns (he/him, she/her, they/them) while holding critical symptoms and conditions (CSCs) constant. Each vignette is expanded into single-CSC ablations, producing three parallel datasets of approximately 23k items each (69k total). We evaluate a frontier LLM and compute Semantic Textual Similarity (STS) between reasoning traces to measure stability across pronoun variants. Our results show overall high similarity (mean STS > 0.80) but reveal consistent localized divergences in cited risk factors, guideline anchors, and differential ordering—even when final diagnoses remain unchanged. Error analysis identifies specific cases where reasoning shifts occur, highlighting clinically relevant bias loci that may cascade into inequitable care. MEDEQUALQA provides a controlled diagnostic setting for auditing reasoning stability in medical AI.
%U https://aclanthology.org/2025.sciprodllm-1.4/
%P 25-37
Markdown (Informal)
[MEDEQUALQA: Evaluating Biases in LLMs with Counterfactual Reasoning](https://aclanthology.org/2025.sciprodllm-1.4/) (Ghosh et al., SciProdLLM 2025)
ACL
Rajarshi Ghosh, Abhay Gupta, Hudson McBride, Anurag Jayant Vaidya, and Faisal Mahmood. 2025. MEDEQUALQA: Evaluating Biases in LLMs with Counterfactual Reasoning. In Proceedings of The First Workshop on Human–LLM Collaboration for Ethical and Responsible Science Production (SciProdLLM), pages 25–37, Mumbai, India (Hybrid). Association for Computational Linguistics.
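
The abstract describes scoring Semantic Textual Similarity (STS) between reasoning traces that differ only in patient pronouns. Below is a minimal, hypothetical sketch of that comparison step, assuming the sentence-transformers library; the encoder name, example traces, and use of cosine similarity are illustrative assumptions, not the authors' documented setup.

```python
# Hypothetical sketch: compare two reasoning traces that differ only in
# patient pronouns via an STS-style cosine similarity, as described in
# the MEDEQUALQA abstract. Model choice is an assumption.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed encoder, not from the paper

trace_he = "Given his chest pain and smoking history, the leading concern is ACS..."
trace_she = "Given her chest pain and smoking history, the leading concern is ACS..."

# Encode both traces and take cosine similarity as the STS score.
embeddings = model.encode([trace_he, trace_she], convert_to_tensor=True)
sts = util.cos_sim(embeddings[0], embeddings[1]).item()

# The paper reports mean STS > 0.80 across pronoun variants; a markedly
# lower score would flag a localized divergence worth manual inspection.
print(f"STS between pronoun variants: {sts:.3f}")
```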