@inproceedings{arias-duart-etal-2025-automatic,
title = "Automatic Evaluation of Healthcare {LLM}s Beyond Question-Answering",
author = "Arias-Duart, Anna and
Martin-Torres, Pablo Agustin and
Hinjos, Daniel and
Bernabeu-Perez, Pablo and
Ganzabal, Lucia Urcelay and
Mallo, Marta Gonzalez and
Gururajan, Ashwin Kumar and
Lopez-Cuena, Enrique and
Alvarez-Napagao, Sergio and
Garcia-Gasulla, Dario",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-short.10/",
doi = "10.18653/v1/2025.naacl-short.10",
pages = "108--130",
ISBN = "979-8-89176-190-2",
abstract = "Current Large Language Models (LLMs) benchmarks are often based on open-ended or close-ended QA evaluations, avoiding the requirement of human labor. Close-ended measurements evaluate the factuality of responses but lack expressiveness. Open-ended capture the model{'}s capacity to produce discourse responses but are harder to assess for correctness. These two approaches are commonly used, either independently or together, though their relationship remains poorly understood. This work is focused on the healthcare domain, where both factuality and discourse matter greatly. It introduces a comprehensive, multi-axis suite for healthcare LLM evaluation, exploring correlations between open and close benchmarks and metrics. Findings include blind spots and overlaps in current methodologies. As an updated sanity check, we release a new medical benchmark{--}CareQA{--}, with both open and closed variants. Finally, we propose a novel metric for open-ended evaluations {--}Relaxed Perplexity{--} to mitigate the identified limitations."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="arias-duart-etal-2025-automatic">
    <titleInfo>
      <title>Automatic Evaluation of Healthcare LLMs Beyond Question-Answering</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Anna</namePart>
      <namePart type="family">Arias-Duart</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pablo</namePart>
      <namePart type="given">Agustin</namePart>
      <namePart type="family">Martin-Torres</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daniel</namePart>
      <namePart type="family">Hinjos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pablo</namePart>
      <namePart type="family">Bernabeu-Perez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lucia</namePart>
      <namePart type="given">Urcelay</namePart>
      <namePart type="family">Ganzabal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marta</namePart>
      <namePart type="given">Gonzalez</namePart>
      <namePart type="family">Mallo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ashwin</namePart>
      <namePart type="given">Kumar</namePart>
      <namePart type="family">Gururajan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Enrique</namePart>
      <namePart type="family">Lopez-Cuena</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sergio</namePart>
      <namePart type="family">Alvarez-Napagao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dario</namePart>
      <namePart type="family">Garcia-Gasulla</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-190-2</identifier>
    </relatedItem>
    <abstract>Current Large Language Model (LLM) benchmarks are often based on open-ended or closed-ended QA evaluations, avoiding the need for human labor. Closed-ended measurements evaluate the factuality of responses but lack expressiveness. Open-ended evaluations capture the model’s capacity to produce discourse responses but are harder to assess for correctness. These two approaches are commonly used, either independently or together, though their relationship remains poorly understood. This work focuses on the healthcare domain, where both factuality and discourse matter greatly. It introduces a comprehensive, multi-axis suite for healthcare LLM evaluation, exploring correlations between open and closed benchmarks and metrics. Findings include blind spots and overlaps in current methodologies. As an updated sanity check, we release a new medical benchmark, CareQA, with both open and closed variants. Finally, we propose a novel metric for open-ended evaluations, Relaxed Perplexity, to mitigate the identified limitations.</abstract>
<identifier type="citekey">arias-duart-etal-2025-automatic</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-short.10</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-short.10/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>108</start>
<end>130</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Evaluation of Healthcare LLMs Beyond Question-Answering
%A Arias-Duart, Anna
%A Martin-Torres, Pablo Agustin
%A Hinjos, Daniel
%A Bernabeu-Perez, Pablo
%A Ganzabal, Lucia Urcelay
%A Mallo, Marta Gonzalez
%A Gururajan, Ashwin Kumar
%A Lopez-Cuena, Enrique
%A Alvarez-Napagao, Sergio
%A Garcia-Gasulla, Dario
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-190-2
%F arias-duart-etal-2025-automatic
%X Current Large Language Model (LLM) benchmarks are often based on open-ended or closed-ended QA evaluations, avoiding the need for human labor. Closed-ended measurements evaluate the factuality of responses but lack expressiveness. Open-ended evaluations capture the model’s capacity to produce discourse responses but are harder to assess for correctness. These two approaches are commonly used, either independently or together, though their relationship remains poorly understood. This work focuses on the healthcare domain, where both factuality and discourse matter greatly. It introduces a comprehensive, multi-axis suite for healthcare LLM evaluation, exploring correlations between open and closed benchmarks and metrics. Findings include blind spots and overlaps in current methodologies. As an updated sanity check, we release a new medical benchmark, CareQA, with both open and closed variants. Finally, we propose a novel metric for open-ended evaluations, Relaxed Perplexity, to mitigate the identified limitations.
%R 10.18653/v1/2025.naacl-short.10
%U https://aclanthology.org/2025.naacl-short.10/
%U https://doi.org/10.18653/v1/2025.naacl-short.10
%P 108-130
Markdown (Informal)
[Automatic Evaluation of Healthcare LLMs Beyond Question-Answering](https://aclanthology.org/2025.naacl-short.10/) (Arias-Duart et al., NAACL 2025)
ACL
Anna Arias-Duart, Pablo Agustin Martin-Torres, Daniel Hinjos, Pablo Bernabeu-Perez, Lucia Urcelay Ganzabal, Marta Gonzalez Mallo, Ashwin Kumar Gururajan, Enrique Lopez-Cuena, Sergio Alvarez-Napagao, and Dario Garcia-Gasulla. 2025. Automatic Evaluation of Healthcare LLMs Beyond Question-Answering. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers), pages 108–130, Albuquerque, New Mexico. Association for Computational Linguistics.