@inproceedings{liu-etal-2026-benchmarking,
title = "Benchmarking {LLM}s on Authentic Cases from Medical Journals",
author = "Liu, Wanlong and
Chen, Junying and
Yang, Yunjin and
Tiwari, Prayag and
Chen, Wenyu and
Wang, Benyou",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.767/",
pages = "15651--15675",
ISBN = "979-8-89176-395-1",
abstract = "In recent years, large language models (LLMs) have demonstrated remarkable capabilities in the medical domain. However, existing medical benchmarks suffer from performance saturation and are predominantly derived from medical exam questions, which fail to reflect the complexity of real-world clinical scenarios. To bridge this gap, we introduce ClinBench, a challenging benchmark based on authentic clinical cases sourced from authoritative medical journals. Each question retains the complete patient information and clinical test results from the original case, effectively simulating real-world clinical practice. Additionally, we implement a rigorous human review process involving medical experts to ensure the quality and reliability of the benchmark. ClinBench supports both textual and multimodal evaluation formats, covering 11 medical specialties with over 2,000 questions, including a dedicated rare disease track, providing a comprehensive resource for assessing the medical reasoning capabilities of LLMs. We evaluate the performance of over 20 open-source and proprietary LLMs and benchmark them against human medical experts. Our findings reveal that human experts still retain an advantage within their specialized fields, while LLMs demonstrate superior overall performance on a broader range of medical specialties."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2026-benchmarking">
<titleInfo>
<title>Benchmarking LLMs on Authentic Cases from Medical Journals</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanlong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junying</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunjin</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prayag</namePart>
<namePart type="family">Tiwari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenyu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benyou</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>In recent years, large language models (LLMs) have demonstrated remarkable capabilities in the medical domain. However, existing medical benchmarks suffer from performance saturation and are predominantly derived from medical exam questions, which fail to reflect the complexity of real-world clinical scenarios. To bridge this gap, we introduce ClinBench, a challenging benchmark based on authentic clinical cases sourced from authoritative medical journals. Each question retains the complete patient information and clinical test results from the original case, effectively simulating real-world clinical practice. Additionally, we implement a rigorous human review process involving medical experts to ensure the quality and reliability of the benchmark. ClinBench supports both textual and multimodal evaluation formats, covering 11 medical specialties with over 2,000 questions, including a dedicated rare disease track, providing a comprehensive resource for assessing the medical reasoning capabilities of LLMs. We evaluate the performance of over 20 open-source and proprietary LLMs and benchmark them against human medical experts. Our findings reveal that human experts still retain an advantage within their specialized fields, while LLMs demonstrate superior overall performance on a broader range of medical specialties.</abstract>
<identifier type="citekey">liu-etal-2026-benchmarking</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.767/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>15651</start>
<end>15675</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking LLMs on Authentic Cases from Medical Journals
%A Liu, Wanlong
%A Chen, Junying
%A Yang, Yunjin
%A Tiwari, Prayag
%A Chen, Wenyu
%A Wang, Benyou
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F liu-etal-2026-benchmarking
%X In recent years, large language models (LLMs) have demonstrated remarkable capabilities in the medical domain. However, existing medical benchmarks suffer from performance saturation and are predominantly derived from medical exam questions, which fail to reflect the complexity of real-world clinical scenarios. To bridge this gap, we introduce ClinBench, a challenging benchmark based on authentic clinical cases sourced from authoritative medical journals. Each question retains the complete patient information and clinical test results from the original case, effectively simulating real-world clinical practice. Additionally, we implement a rigorous human review process involving medical experts to ensure the quality and reliability of the benchmark. ClinBench supports both textual and multimodal evaluation formats, covering 11 medical specialties with over 2,000 questions, including a dedicated rare disease track, providing a comprehensive resource for assessing the medical reasoning capabilities of LLMs. We evaluate the performance of over 20 open-source and proprietary LLMs and benchmark them against human medical experts. Our findings reveal that human experts still retain an advantage within their specialized fields, while LLMs demonstrate superior overall performance on a broader range of medical specialties.
%U https://aclanthology.org/2026.findings-acl.767/
%P 15651-15675
Markdown (Informal)
[Benchmarking LLMs on Authentic Cases from Medical Journals](https://aclanthology.org/2026.findings-acl.767/) (Liu et al., Findings 2026)
ACL
- Wanlong Liu, Junying Chen, Yunjin Yang, Prayag Tiwari, Wenyu Chen, and Benyou Wang. 2026. Benchmarking LLMs on Authentic Cases from Medical Journals. In Findings of the Association for Computational Linguistics: ACL 2026, pages 15651–15675, San Diego, California, United States. Association for Computational Linguistics.