BibTeX
@inproceedings{diekmann-etal-2025-llms,
title = "{LLM}s as Medical Safety Judges: Evaluating Alignment with Human Annotation in Patient-Facing {QA}",
author = "Diekmann, Yella and
Fensore, Chase and
Carrillo-Larco, Rodrigo and
Castejon Rosales, Eduard and
Shiromani, Sakshi and
Pai, Rima and
Shah, Megha and
Ho, Joyce",
editor = "Demner-Fushman, Dina and
Ananiadou, Sophia and
Miwa, Makoto and
Tsujii, Junichi",
booktitle = "Proceedings of the 24th Workshop on Biomedical Language Processing",
month = aug,
year = "2025",
address = "Viena, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bionlp-1.19/",
doi = "10.18653/v1/2025.bionlp-1.19",
pages = "217--224",
ISBN = "979-8-89176-275-6",
abstract = "The increasing deployment of LLMs in patient-facing medical QA raises concerns about the reliability and safety of their responses. Traditional evaluation methods rely on expert human annotation, which is costly, time-consuming, and difficult to scale. This study explores the feasibility of using LLMs as automated judges for medical QA evaluation. We benchmark LLMs against human annotators across eight qualitative safety metrics and introduce adversarial question augmentation to assess LLMs' robustness in evaluating medical responses. Our findings reveal that while LLMs achieve high accuracy in objective metrics such as scientific consensus and grammaticality, they struggle with more subjective categories like empathy and extent of harm. This work contributes to the ongoing discussion on automating safety assessments in medical AI and informs the development of more reliable evaluation methodologies."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="diekmann-etal-2025-llms">
<titleInfo>
<title>LLMs as Medical Safety Judges: Evaluating Alignment with Human Annotation in Patient-Facing QA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yella</namePart>
<namePart type="family">Diekmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chase</namePart>
<namePart type="family">Fensore</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rodrigo</namePart>
<namePart type="family">Carrillo-Larco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Castejon Rosales</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakshi</namePart>
<namePart type="family">Shiromani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rima</namePart>
<namePart type="family">Pai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Megha</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Ho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 24th Workshop on Biomedical Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Makoto</namePart>
<namePart type="family">Miwa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-275-6</identifier>
</relatedItem>
<abstract>The increasing deployment of LLMs in patient-facing medical QA raises concerns about the reliability and safety of their responses. Traditional evaluation methods rely on expert human annotation, which is costly, time-consuming, and difficult to scale. This study explores the feasibility of using LLMs as automated judges for medical QA evaluation. We benchmark LLMs against human annotators across eight qualitative safety metrics and introduce adversarial question augmentation to assess LLMs’ robustness in evaluating medical responses. Our findings reveal that while LLMs achieve high accuracy in objective metrics such as scientific consensus and grammaticality, they struggle with more subjective categories like empathy and extent of harm. This work contributes to the ongoing discussion on automating safety assessments in medical AI and informs the development of more reliable evaluation methodologies.</abstract>
<identifier type="citekey">diekmann-etal-2025-llms</identifier>
<identifier type="doi">10.18653/v1/2025.bionlp-1.19</identifier>
<location>
<url>https://aclanthology.org/2025.bionlp-1.19/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>217</start>
<end>224</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T LLMs as Medical Safety Judges: Evaluating Alignment with Human Annotation in Patient-Facing QA
%A Diekmann, Yella
%A Fensore, Chase
%A Carrillo-Larco, Rodrigo
%A Castejon Rosales, Eduard
%A Shiromani, Sakshi
%A Pai, Rima
%A Shah, Megha
%A Ho, Joyce
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Miwa, Makoto
%Y Tsujii, Junichi
%S Proceedings of the 24th Workshop on Biomedical Language Processing
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-275-6
%F diekmann-etal-2025-llms
%X The increasing deployment of LLMs in patient-facing medical QA raises concerns about the reliability and safety of their responses. Traditional evaluation methods rely on expert human annotation, which is costly, time-consuming, and difficult to scale. This study explores the feasibility of using LLMs as automated judges for medical QA evaluation. We benchmark LLMs against human annotators across eight qualitative safety metrics and introduce adversarial question augmentation to assess LLMs’ robustness in evaluating medical responses. Our findings reveal that while LLMs achieve high accuracy in objective metrics such as scientific consensus and grammaticality, they struggle with more subjective categories like empathy and extent of harm. This work contributes to the ongoing discussion on automating safety assessments in medical AI and informs the development of more reliable evaluation methodologies.
%R 10.18653/v1/2025.bionlp-1.19
%U https://aclanthology.org/2025.bionlp-1.19/
%U https://doi.org/10.18653/v1/2025.bionlp-1.19
%P 217-224
Markdown (Informal)
[LLMs as Medical Safety Judges: Evaluating Alignment with Human Annotation in Patient-Facing QA](https://aclanthology.org/2025.bionlp-1.19/) (Diekmann et al., BioNLP 2025)
ACL
Yella Diekmann, Chase Fensore, Rodrigo Carrillo-Larco, Eduard Castejon Rosales, Sakshi Shiromani, Rima Pai, Megha Shah, and Joyce Ho. 2025. LLMs as Medical Safety Judges: Evaluating Alignment with Human Annotation in Patient-Facing QA. In Proceedings of the 24th Workshop on Biomedical Language Processing, pages 217–224, Vienna, Austria. Association for Computational Linguistics.