% ACL Anthology record for the MedAct paper (Krishna, Pridgen, Silverstein),
% published in the Proceedings of the Fifth Workshop on Generation,
% Evaluation and Metrics (GEM), 2026, pages 222--230.
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{krishna-etal-2026-medact,
    title = "{MedAct}: Removing the Human Bottleneck in Benchmarking Clinical {LLM} Safety",
    author = "Krishna, Arjun and
      Pridgen, Brian and
      Silverstein, Max",
    editor = "Mille, Simon and
      Gehrmann, Sebastian and
      Schmidtov{\'a}, Patr{\'i}cia and
      Du{\v{s}}ek, Ond{\v{r}}ej and
      Fadaee, Marzieh and
      Lo, Kyle and
      Santus, Enrico and
      Stanovsky, Gabriel",
    booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
    month = jul,
    year = "2026",
    address = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.gem-main.24/",
    pages = "222--230",
    ISBN = "979-8-89176-423-1",
    abstract = "Most medical benchmarks for large language models test factual recall through multiple-choice questions, but on-the-ground physicians do not have the luxury of four options to choose from. NOHARM (Wu et al., 2025) demonstrated this limitation using 100 real eConsult cases annotated by 29 board-certified physicians, showing that action-level evaluation reveals omission and commission failure modes invisible to multiple-choice tests. However, NOHARM{'}s cases are closed and their creation required substantial expert physician time, creating a human bottleneck that limits the scalability and openness of this evaluation approach. We present MedAct, an open replication of NOHARM{'}s evaluation methodology using synthetically generated cases. Our contribution is a multi-stage generation pipeline that uses language models grounded in clinical practice guidelines to produce 100 cases across ten specialties, each containing roughly 50 plausible next-step actions labeled as Appropriate or Inappropriate using NOHARM{'}s scoring framework. The pipeline includes structural quality controls: 83 of 100 cases pass all five automated checks, and answer-leaking language appears in only 0.06{\%} of actions. In a pilot evaluation of nine contemporary LLMs using this synthetic benchmark, we observe patterns consistent with NOHARM{'}s findings on human-curated cases, including that omissions dominate error volume while commissions dominate severe errors. We release all cases, rubrics, generation tooling, and scoring code openly, removing the human-bottleneck barrier to action-level clinical LLM evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="krishna-etal-2026-medact">
<titleInfo>
<title>MedAct: Removing the Human Bottleneck in Benchmarking Clinical LLM Safety</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arjun</namePart>
<namePart type="family">Krishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Pridgen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Silverstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>Most medical benchmarks for large language models test factual recall through multiple-choice questions, but on-the-ground physicians do not have the luxury of four options to choose from. NOHARM (Wu et al., 2025) demonstrated this limitation using 100 real eConsult cases annotated by 29 board-certified physicians, showing that action-level evaluation reveals omission and commission failure modes invisible to multiple-choice tests. However, NOHARM’s cases are closed and their creation required substantial expert physician time, creating a human bottleneck that limits the scalability and openness of this evaluation approach. We present MedAct, an open replication of NOHARM’s evaluation methodology using synthetically generated cases. Our contribution is a multi-stage generation pipeline that uses language models grounded in clinical practice guidelines to produce 100 cases across ten specialties, each containing roughly 50 plausible next-step actions labeled as Appropriate or Inappropriate using NOHARM’s scoring framework. The pipeline includes structural quality controls: 83 of 100 cases pass all five automated checks, and answer-leaking language appears in only 0.06% of actions. In a pilot evaluation of nine contemporary LLMs using this synthetic benchmark, we observe patterns consistent with NOHARM’s findings on human-curated cases, including that omissions dominate error volume while commissions dominate severe errors. We release all cases, rubrics, generation tooling, and scoring code openly, removing the human-bottleneck barrier to action-level clinical LLM evaluation.</abstract>
<identifier type="citekey">krishna-etal-2026-medact</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.24/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>222</start>
<end>230</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MedAct: Removing the Human Bottleneck in Benchmarking Clinical LLM Safety
%A Krishna, Arjun
%A Pridgen, Brian
%A Silverstein, Max
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F krishna-etal-2026-medact
%X Most medical benchmarks for large language models test factual recall through multiple-choice questions, but on-the-ground physicians do not have the luxury of four options to choose from. NOHARM (Wu et al., 2025) demonstrated this limitation using 100 real eConsult cases annotated by 29 board-certified physicians, showing that action-level evaluation reveals omission and commission failure modes invisible to multiple-choice tests. However, NOHARM’s cases are closed and their creation required substantial expert physician time, creating a human bottleneck that limits the scalability and openness of this evaluation approach. We present MedAct, an open replication of NOHARM’s evaluation methodology using synthetically generated cases. Our contribution is a multi-stage generation pipeline that uses language models grounded in clinical practice guidelines to produce 100 cases across ten specialties, each containing roughly 50 plausible next-step actions labeled as Appropriate or Inappropriate using NOHARM’s scoring framework. The pipeline includes structural quality controls: 83 of 100 cases pass all five automated checks, and answer-leaking language appears in only 0.06% of actions. In a pilot evaluation of nine contemporary LLMs using this synthetic benchmark, we observe patterns consistent with NOHARM’s findings on human-curated cases, including that omissions dominate error volume while commissions dominate severe errors. We release all cases, rubrics, generation tooling, and scoring code openly, removing the human-bottleneck barrier to action-level clinical LLM evaluation.
%U https://aclanthology.org/2026.gem-main.24/
%P 222-230
Markdown (Informal)
[MedAct: Removing the Human Bottleneck in Benchmarking Clinical LLM Safety](https://aclanthology.org/2026.gem-main.24/) (Krishna et al., GEM 2026)
ACL