@inproceedings{yoo-2026-demographic,
title = "When Demographic Sensitivity Isn{'}t What It Seems: Baseline-Aware Counterfactual Audits for Clinical {NLP}",
author = "Yoo, Hyunwoo",
editor = "Demner-Fushman, Dina and
Ananiadou, Sophia and
Roberts, Kirk and
Tsujii, Junichi",
booktitle = "{B}io{NLP} 2026",
month = jul,
year = "2026",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.bionlp-1.14/",
pages = "141--155",
ISBN = "979-8-89176-434-7",
abstract = "Clinical NLP systems are increasingly used for triage support, prediction, and decision assistance in EHR-based settings, where demographic fairness is a critical concern. A common evaluation approach is counterfactual demographic perturbation: modifying attributes such as age or sex while holding clinical evidence fixed and measuring output changes. However, we show that such counterfactual audits can be misleading when interpreted in isolation. Across three clinical LLMs, we find that non-demographic control perturbations (e.g., paraphrases) often induce output variability comparable to or greater than demographic edits. This can contribute to overestimation or misinterpretation of demographic bias. To address this, we propose a baseline-aware audit framework that explicitly compares demographic perturbations against control baselines. Our analysis reveals that (i) label-level stability can mask substantial variation in generated rationales and recommendations, and (ii) age-based perturbations generally induce larger effects than sex-based ones in borderline cases. Crucially, we identify a high intrinsic instability ({``}noise floor''; 0.46{--}0.71 Jaccard instability) in clinical LLM generations, while additional matched-metric analyses show that demographic perturbations are often comparable to non-demographic baseline variability. These findings highlight a key limitation of existing fairness evaluations: without establishing appropriate baselines, apparent demographic sensitivity may be over- or mis-attributed to bias rather than broader generative instability. We argue that baseline-aware counterfactual audits, which explicitly compare demographic effects against intrinsic model noise, provide a more reliable lens for evaluating clinical NLP systems in high-stakes settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yoo-2026-demographic">
<titleInfo>
<title>When Demographic Sensitivity Isn’t What It Seems: Baseline-Aware Counterfactual Audits for Clinical NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hyunwoo</namePart>
<namePart type="family">Yoo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>BioNLP 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-434-7</identifier>
</relatedItem>
<abstract>Clinical NLP systems are increasingly used for triage support, prediction, and decision assistance in EHR-based settings, where demographic fairness is a critical concern. A common evaluation approach is counterfactual demographic perturbation: modifying attributes such as age or sex while holding clinical evidence fixed and measuring output changes. However, we show that such counterfactual audits can be misleading when interpreted in isolation. Across three clinical LLMs, we find that non-demographic control perturbations (e.g., paraphrases) often induce output variability comparable to or greater than demographic edits. This can contribute to overestimation or misinterpretation of demographic bias. To address this, we propose a baseline-aware audit framework that explicitly compares demographic perturbations against control baselines. Our analysis reveals that (i) label-level stability can mask substantial variation in generated rationales and recommendations, and (ii) age-based perturbations generally induce larger effects than sex-based ones in borderline cases. Crucially, we identify a high intrinsic instability (“noise floor”; 0.46–0.71 Jaccard instability) in clinical LLM generations, while additional matched-metric analyses show that demographic perturbations are often comparable to non-demographic baseline variability. These findings highlight a key limitation of existing fairness evaluations: without establishing appropriate baselines, apparent demographic sensitivity may be over- or mis-attributed to bias rather than broader generative instability. We argue that baseline-aware counterfactual audits, which explicitly compare demographic effects against intrinsic model noise, provide a more reliable lens for evaluating clinical NLP systems in high-stakes settings.</abstract>
<identifier type="citekey">yoo-2026-demographic</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-1.14/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>141</start>
<end>155</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Demographic Sensitivity Isn’t What It Seems: Baseline-Aware Counterfactual Audits for Clinical NLP
%A Yoo, Hyunwoo
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F yoo-2026-demographic
%X Clinical NLP systems are increasingly used for triage support, prediction, and decision assistance in EHR-based settings, where demographic fairness is a critical concern. A common evaluation approach is counterfactual demographic perturbation: modifying attributes such as age or sex while holding clinical evidence fixed and measuring output changes. However, we show that such counterfactual audits can be misleading when interpreted in isolation. Across three clinical LLMs, we find that non-demographic control perturbations (e.g., paraphrases) often induce output variability comparable to or greater than demographic edits. This can contribute to overestimation or misinterpretation of demographic bias. To address this, we propose a baseline-aware audit framework that explicitly compares demographic perturbations against control baselines. Our analysis reveals that (i) label-level stability can mask substantial variation in generated rationales and recommendations, and (ii) age-based perturbations generally induce larger effects than sex-based ones in borderline cases. Crucially, we identify a high intrinsic instability (“noise floor”; 0.46–0.71 Jaccard instability) in clinical LLM generations, while additional matched-metric analyses show that demographic perturbations are often comparable to non-demographic baseline variability. These findings highlight a key limitation of existing fairness evaluations: without establishing appropriate baselines, apparent demographic sensitivity may be over- or mis-attributed to bias rather than broader generative instability. We argue that baseline-aware counterfactual audits, which explicitly compare demographic effects against intrinsic model noise, provide a more reliable lens for evaluating clinical NLP systems in high-stakes settings.
%U https://aclanthology.org/2026.bionlp-1.14/
%P 141-155
Markdown (Informal)
[When Demographic Sensitivity Isn’t What It Seems: Baseline-Aware Counterfactual Audits for Clinical NLP](https://aclanthology.org/2026.bionlp-1.14/) (Yoo, BioNLP 2026)
ACL