% BibTeX record for ACL Anthology entry 2025.findings-acl.857.
% Braces in the title protect acronyms from style-driven sentence casing.
@inproceedings{he-etal-2025-astrid,
    title     = {{ASTRID} - An Automated and Scalable {TRIaD} for the Evaluation of {RAG}-based Clinical Question Answering Systems},
    author    = {He, Yajie Vera and
                 Chowdhury, Mohita and
                 Joselowitz, Jared and
                 Higham, Aisling and
                 Lim, Ernest},
    editor    = {Che, Wanxiang and
                 Nabende, Joyce and
                 Shutova, Ekaterina and
                 Pilehvar, Mohammad Taher},
    booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
    month     = jul,
    year      = {2025},
    address   = {Vienna, Austria},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.findings-acl.857/},
    doi       = {10.18653/v1/2025.findings-acl.857},
    pages     = {16700--16716},
    isbn      = {979-8-89176-256-5},
    abstract  = {Large Language Models (LLMs) have shown impressive potential in clinical question answering (QA), with Retrieval Augmented Generation (RAG) emerging as a leading approach for ensuring the factual accuracy of model responses. However, current automated RAG metrics perform poorly in clinical and conversational use cases. Using clinical human evaluations of responses is expensive, unscalable, and not conducive to the continuous iterative development of RAG systems. To address these challenges, we introduce ASTRID - an Automated and Scalable TRIaD for evaluating clinical QA systems leveraging RAG - consisting of three metrics: Context Relevance (CR), Refusal Accuracy (RA), and Conversational Faithfulness (CF). Our novel evaluation metric, CF, is designed to better capture the faithfulness of a model{'}s response to the knowledge base without penalising conversational elements. Additionally, our metric RA captures the refusal to address questions outside of the system{'}s scope of practice. To validate our triad, we curate a dataset of over 200 real-world patient questions posed to an LLM-based QA agent during surgical follow-up for cataract surgery - the highest volume operation in the world - augmented with clinician-selected questions for emergency, and clinical and non-clinical out-of-domain scenarios. We demonstrate that CF predicts human ratings of faithfulness more accurately than existing definitions in conversational settings. Furthermore, using eight different LLMs, we demonstrate that the three metrics can closely agree with human evaluations, highlighting the potential of these metrics for use in LLM-driven automated evaluation pipelines. Finally, we show that evaluation using our triad of CF, RA, and CR exhibits alignment with clinician assessment for inappropriate, harmful, or unhelpful responses. We also publish the prompts and datasets for these experiments, providing valuable resources for further research and development.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="he-etal-2025-astrid">
    <titleInfo>
      <title>ASTRID - An Automated and Scalable TRIaD for the Evaluation of RAG-based Clinical Question Answering Systems</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Yajie</namePart>
      <namePart type="given">Vera</namePart>
      <namePart type="family">He</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohita</namePart>
      <namePart type="family">Chowdhury</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jared</namePart>
      <namePart type="family">Joselowitz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aisling</namePart>
      <namePart type="family">Higham</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ernest</namePart>
      <namePart type="family">Lim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>
    <abstract>Large Language Models (LLMs) have shown impressive potential in clinical question answering (QA), with Retrieval Augmented Generation (RAG) emerging as a leading approach for ensuring the factual accuracy of model responses. However, current automated RAG metrics perform poorly in clinical and conversational use cases. Using clinical human evaluations of responses is expensive, unscalable, and not conducive to the continuous iterative development of RAG systems. To address these challenges, we introduce ASTRID - an Automated and Scalable TRIaD for evaluating clinical QA systems leveraging RAG - consisting of three metrics: Context Relevance (CR), Refusal Accuracy (RA), and Conversational Faithfulness (CF). Our novel evaluation metric, CF, is designed to better capture the faithfulness of a model’s response to the knowledge base without penalising conversational elements. Additionally, our metric RA captures the refusal to address questions outside of the system’s scope of practice. To validate our triad, we curate a dataset of over 200 real-world patient questions posed to an LLM-based QA agent during surgical follow-up for cataract surgery - the highest volume operation in the world - augmented with clinician-selected questions for emergency, and clinical and non-clinical out-of-domain scenarios. We demonstrate that CF predicts human ratings of faithfulness more accurately than existing definitions in conversational settings. Furthermore, using eight different LLMs, we demonstrate that the three metrics can closely agree with human evaluations, highlighting the potential of these metrics for use in LLM-driven automated evaluation pipelines. Finally, we show that evaluation using our triad of CF, RA, and CR exhibits alignment with clinician assessment for inappropriate, harmful, or unhelpful responses. We also publish the prompts and datasets for these experiments, providing valuable resources for further research and development.</abstract>
    <!-- Identifiers and location of the paper itself. -->
    <identifier type="citekey">he-etal-2025-astrid</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-acl.857</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-acl.857/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>16700</start>
        <end>16716</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ASTRID - An Automated and Scalable TRIaD for the Evaluation of RAG-based Clinical Question Answering Systems
%A He, Yajie Vera
%A Chowdhury, Mohita
%A Joselowitz, Jared
%A Higham, Aisling
%A Lim, Ernest
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F he-etal-2025-astrid
%X Large Language Models (LLMs) have shown impressive potential in clinical question answering (QA), with Retrieval Augmented Generation (RAG) emerging as a leading approach for ensuring the factual accuracy of model responses. However, current automated RAG metrics perform poorly in clinical and conversational use cases. Using clinical human evaluations of responses is expensive, unscalable, and not conducive to the continuous iterative development of RAG systems. To address these challenges, we introduce ASTRID - an Automated and Scalable TRIaD for evaluating clinical QA systems leveraging RAG - consisting of three metrics: Context Relevance (CR), Refusal Accuracy (RA), and Conversational Faithfulness (CF). Our novel evaluation metric, CF, is designed to better capture the faithfulness of a model’s response to the knowledge base without penalising conversational elements. Additionally, our metric RA captures the refusal to address questions outside of the system’s scope of practice. To validate our triad, we curate a dataset of over 200 real-world patient questions posed to an LLM-based QA agent during surgical follow-up for cataract surgery - the highest volume operation in the world - augmented with clinician-selected questions for emergency, and clinical and non-clinical out-of-domain scenarios. We demonstrate that CF predicts human ratings of faithfulness more accurately than existing definitions in conversational settings. Furthermore, using eight different LLMs, we demonstrate that the three metrics can closely agree with human evaluations, highlighting the potential of these metrics for use in LLM-driven automated evaluation pipelines. Finally, we show that evaluation using our triad of CF, RA, and CR exhibits alignment with clinician assessment for inappropriate, harmful, or unhelpful responses. We also publish the prompts and datasets for these experiments, providing valuable resources for further research and development.
%R 10.18653/v1/2025.findings-acl.857
%U https://aclanthology.org/2025.findings-acl.857/
%U https://doi.org/10.18653/v1/2025.findings-acl.857
%P 16700-16716
Markdown (Informal)
[ASTRID - An Automated and Scalable TRIaD for the Evaluation of RAG-based Clinical Question Answering Systems](https://aclanthology.org/2025.findings-acl.857/) (He et al., Findings 2025)
ACL