% BibTeX record for the Findings of ACL 2026 paper "Beyond Benchmarks".
% Field values are brace-delimited; acronyms and proper nouns that must keep
% their case are protected with whole-word braces.
% NOTE(review): the family name "WU" is stored in all caps, unlike the other
% authors, and "Xiangjun, Xu" / "Boqin, Zhuang" may have family and given
% parts swapped -- confirm against the published paper before changing.
@inproceedings{li-etal-2026-beyond-benchmarks,
  title     = {Beyond Benchmarks: A Capability-Based Maturity Model for Systematic {AI} Integration in Hospitals},
  author    = {Li, Rui and
               WU, Xiaofen and
               Liu, Mingqian and
               Song, Xiaoxia and
               Xiangjun, Xu and
               Qiao, Jiacheng and
               Boqin, Zhuang and
               Chen, Xu},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1047/},
  pages     = {20883--20895},
  isbn      = {979-8-89176-395-1},
  abstract  = {Current Large Language Models (LLMs) demonstrate exceptional performance on medical benchmarks. However, models that excel in standardized tests focused on medical knowledge recall are not necessarily effective in real-world healthcare scenarios. This disparity between academic performance and clinical effectiveness stems from existing evaluations focusing overly on knowledge retrieval and QA, while neglecting high-load executive tasks in real clinical workflows. The effective execution of such tasks depends not only on model reasoning but also on the overall digital maturity of the healthcare institution. To address this, we propose a ``Capability-Based Hospital AI Maturity Model'' framework. This framework establishes a layered maturity system based on capabilities. By categorizing hospital AI capabilities into distinct maturity levels, it provides a clear, stepwise evolutionary path for hospitals, guiding them from foundational infrastructure construction to ubiquitous intelligence. Guided by this framework, we constructed ten representative real-world clinical scenarios as a reference test set and compared the performance of multiple models across benchmarks and real-world scenarios. Preliminary results suggest that, compared to relying solely on academic benchmark scores, this maturity assessment mode{---}which integrates system governance and scenario constraints{---}may provide a more valuable basis for AI adoption in medical institutions.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey li-etal-2026-beyond-benchmarks. -->
  <mods ID="li-etal-2026-beyond-benchmarks">
    <titleInfo>
      <title>Beyond Benchmarks: A Capability-Based Maturity Model for Systematic AI Integration in Hospitals</title>
    </titleInfo>
    <!-- Authors, in the order they are listed. -->
    <name type="personal">
      <namePart type="given">Rui</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiaofen</namePart>
      <namePart type="family">WU</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mingqian</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiaoxia</namePart>
      <namePart type="family">Song</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xu</namePart>
      <namePart type="family">Xiangjun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiacheng</namePart>
      <namePart type="family">Qiao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhuang</namePart>
      <namePart type="family">Boqin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xu</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Current Large Language Models (LLMs) demonstrate exceptional performance on medical benchmarks. However, models that excel in standardized tests focused on medical knowledge recall are not necessarily effective in real-world healthcare scenarios. This disparity between academic performance and clinical effectiveness stems from existing evaluations focusing overly on knowledge retrieval and QA, while neglecting high-load executive tasks in real clinical workflows. The effective execution of such tasks depends not only on model reasoning but also on the overall digital maturity of the healthcare institution. To address this, we propose a “Capability-Based Hospital AI Maturity Model” framework. This framework establishes a layered maturity system based on capabilities. By categorizing hospital AI capabilities into distinct maturity levels, it provides a clear, stepwise evolutionary path for hospitals, guiding them from foundational infrastructure construction to ubiquitous intelligence. Guided by this framework, we constructed ten representative real-world clinical scenarios as a reference test set and compared the performance of multiple models across benchmarks and real-world scenarios. Preliminary results suggest that, compared to relying solely on academic benchmark scores, this maturity assessment mode—which integrates system governance and scenario constraints—may provide a more valuable basis for AI adoption in medical institutions.</abstract>
    <identifier type="citekey">li-etal-2026-beyond-benchmarks</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1047/</url>
    </location>
    <!-- Issue date and page range of this paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>20883</start>
        <end>20895</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Beyond Benchmarks: A Capability-Based Maturity Model for Systematic AI Integration in Hospitals
%A Li, Rui
%A WU, Xiaofen
%A Liu, Mingqian
%A Song, Xiaoxia
%A Xiangjun, Xu
%A Qiao, Jiacheng
%A Boqin, Zhuang
%A Chen, Xu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F li-etal-2026-beyond-benchmarks
%X Current Large Language Models (LLMs) demonstrate exceptional performance on medical benchmarks. However, models that excel in standardized tests focused on medical knowledge recall are not necessarily effective in real-world healthcare scenarios. This disparity between academic performance and clinical effectiveness stems from existing evaluations focusing overly on knowledge retrieval and QA, while neglecting high-load executive tasks in real clinical workflows. The effective execution of such tasks depends not only on model reasoning but also on the overall digital maturity of the healthcare institution. To address this, we propose a “Capability-Based Hospital AI Maturity Model” framework. This framework establishes a layered maturity system based on capabilities. By categorizing hospital AI capabilities into distinct maturity levels, it provides a clear, stepwise evolutionary path for hospitals, guiding them from foundational infrastructure construction to ubiquitous intelligence. Guided by this framework, we constructed ten representative real-world clinical scenarios as a reference test set and compared the performance of multiple models across benchmarks and real-world scenarios. Preliminary results suggest that, compared to relying solely on academic benchmark scores, this maturity assessment mode—which integrates system governance and scenario constraints—may provide a more valuable basis for AI adoption in medical institutions.
%U https://aclanthology.org/2026.findings-acl.1047/
%P 20883-20895
Markdown (Informal)
[Beyond Benchmarks: A Capability-Based Maturity Model for Systematic AI Integration in Hospitals](https://aclanthology.org/2026.findings-acl.1047/) (Li et al., Findings 2026)
ACL
- Rui Li, Xiaofen WU, Mingqian Liu, Xiaoxia Song, Xu Xiangjun, Jiacheng Qiao, Zhuang Boqin, and Xu Chen. 2026. Beyond Benchmarks: A Capability-Based Maturity Model for Systematic AI Integration in Hospitals. In Findings of the Association for Computational Linguistics: ACL 2026, pages 20883–20895, San Diego, California, United States. Association for Computational Linguistics.