@inproceedings{rawat-etal-2024-diversitymedqa,
  title     = {{DiversityMedQA}: A Benchmark for Assessing Demographic Biases in Medical Diagnosis using Large Language Models},
  author    = {Rawat, Rajat and
               McBride, Hudson and
               Nirmal, Dhiyaan Chakkresh and
               Ghosh, Rajarshi and
               Moon, Jong and
               Alamuri, Dhruv Karthik and
               Zhu, Kevin},
  editor    = {Dementieva, Daryna and
               Ignat, Oana and
               Jin, Zhijing and
               Mihalcea, Rada and
               Piatti, Giorgio and
               Tetreault, Joel and
               Wilson, Steven and
               Zhao, Jieyu},
  booktitle = {Proceedings of the Third Workshop on {NLP} for Positive Impact},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.nlp4pi-1.29},
  pages     = {334--348},
  abstract  = {As large language models (LLMs) gain traction in healthcare, concerns about their susceptibility to demographic biases are growing. We introduce DiversityMedQA, a novel benchmark designed to assess LLM responses to medical queries across diverse patient demographics, such as gender and ethnicity. By perturbing questions from the MedQA dataset, which comprises medical board exam questions, we created a benchmark that captures the nuanced differences in medical diagnosis across varying patient profiles. To ensure that our perturbations did not alter the clinical outcomes, we implemented a filtering strategy to validate each perturbation, so that any performance discrepancies would be indicative of bias. Our findings reveal notable discrepancies in model performance when tested against these demographic variations. By releasing DiversityMedQA, we provide a resource for evaluating and mitigating demographic bias in LLM medical diagnoses.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rawat-etal-2024-diversitymedqa">
<titleInfo>
<title>DiversityMedQA: A Benchmark for Assessing Demographic Biases in Medical Diagnosis using Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rajat</namePart>
<namePart type="family">Rawat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hudson</namePart>
<namePart type="family">McBride</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhiyaan</namePart>
<namePart type="given">Chakkresh</namePart>
<namePart type="family">Nirmal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajarshi</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jong</namePart>
<namePart type="family">Moon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhruv</namePart>
<namePart type="given">Karthik</namePart>
<namePart type="family">Alamuri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on NLP for Positive Impact</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daryna</namePart>
<namePart type="family">Dementieva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oana</namePart>
<namePart type="family">Ignat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhijing</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rada</namePart>
<namePart type="family">Mihalcea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giorgio</namePart>
<namePart type="family">Piatti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Wilson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jieyu</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As large language models (LLMs) gain traction in healthcare, concerns about their susceptibility to demographic biases are growing. We introduce DiversityMedQA, a novel benchmark designed to assess LLM responses to medical queries across diverse patient demographics, such as gender and ethnicity. By perturbing questions from the MedQA dataset, which comprises medical board exam questions, we created a benchmark that captures the nuanced differences in medical diagnosis across varying patient profiles. To ensure that our perturbations did not alter the clinical outcomes, we implemented a filtering strategy to validate each perturbation, so that any performance discrepancies would be indicative of bias. Our findings reveal notable discrepancies in model performance when tested against these demographic variations. By releasing DiversityMedQA, we provide a resource for evaluating and mitigating demographic bias in LLM medical diagnoses.</abstract>
<identifier type="citekey">rawat-etal-2024-diversitymedqa</identifier>
<location>
<url>https://aclanthology.org/2024.nlp4pi-1.29</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>334</start>
<end>348</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DiversityMedQA: A Benchmark for Assessing Demographic Biases in Medical Diagnosis using Large Language Models
%A Rawat, Rajat
%A McBride, Hudson
%A Nirmal, Dhiyaan Chakkresh
%A Ghosh, Rajarshi
%A Moon, Jong
%A Alamuri, Dhruv Karthik
%A Zhu, Kevin
%Y Dementieva, Daryna
%Y Ignat, Oana
%Y Jin, Zhijing
%Y Mihalcea, Rada
%Y Piatti, Giorgio
%Y Tetreault, Joel
%Y Wilson, Steven
%Y Zhao, Jieyu
%S Proceedings of the Third Workshop on NLP for Positive Impact
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F rawat-etal-2024-diversitymedqa
%X As large language models (LLMs) gain traction in healthcare, concerns about their susceptibility to demographic biases are growing. We introduce DiversityMedQA, a novel benchmark designed to assess LLM responses to medical queries across diverse patient demographics, such as gender and ethnicity. By perturbing questions from the MedQA dataset, which comprises medical board exam questions, we created a benchmark that captures the nuanced differences in medical diagnosis across varying patient profiles. To ensure that our perturbations did not alter the clinical outcomes, we implemented a filtering strategy to validate each perturbation, so that any performance discrepancies would be indicative of bias. Our findings reveal notable discrepancies in model performance when tested against these demographic variations. By releasing DiversityMedQA, we provide a resource for evaluating and mitigating demographic bias in LLM medical diagnoses.
%U https://aclanthology.org/2024.nlp4pi-1.29
%P 334-348
Markdown (Informal)
[DiversityMedQA: A Benchmark for Assessing Demographic Biases in Medical Diagnosis using Large Language Models](https://aclanthology.org/2024.nlp4pi-1.29) (Rawat et al., NLP4PI 2024)
ACL
- Rajat Rawat, Hudson McBride, Dhiyaan Chakkresh Nirmal, Rajarshi Ghosh, Jong Moon, Dhruv Karthik Alamuri, and Kevin Zhu. 2024. DiversityMedQA: A Benchmark for Assessing Demographic Biases in Medical Diagnosis using Large Language Models. In Proceedings of the Third Workshop on NLP for Positive Impact, pages 334–348, Miami, Florida, USA. Association for Computational Linguistics.