@inproceedings{liu-etal-2025-interactive,
title = "Interactive Evaluation for Medical {LLM}s via Task-oriented Dialogue System",
author = "Liu, Ruoyu and
Xue, Kui and
Zhang, Xiaofan and
Zhang, Shaoting",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.325/",
pages = "4871--4896",
abstract = "This study focuses on evaluating proactive communication and diagnostic capabilities of medical Large Language Models (LLMs), which directly impact their effectiveness in patient consultations. In typical medical scenarios, doctors often ask a set of questions to gain a comprehensive understanding of patients' conditions. We argue that single-turn question-answering tasks such as MultiMedQA are insufficient for evaluating LLMs' medical consultation abilities. To address this limitation, we developed an evaluation benchmark called Multi-turn Medical Dialogue Evaluation (MMD-Eval), specifically designed to evaluate the proactive communication and diagnostic capabilities of medical LLMs during consultations. Considering the high cost and potential for hallucinations in LLMs, we innovatively trained a task-oriented dialogue system to simulate patients engaging in dialogues with the medical LLMs using our structured medical records dataset. This approach enabled us to generate multi-turn dialogue data. Subsequently, we evaluate the communication skills and medical expertise of the medical LLMs. All resources associated with this study will be made publicly available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2025-interactive">
<titleInfo>
<title>Interactive Evaluation for Medical LLMs via Task-oriented Dialogue System</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruoyu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kui</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaofan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shaoting</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study focuses on evaluating proactive communication and diagnostic capabilities of medical Large Language Models (LLMs), which directly impact their effectiveness in patient consultations. In typical medical scenarios, doctors often ask a set of questions to gain a comprehensive understanding of patients’ conditions. We argue that single-turn question-answering tasks such as MultiMedQA are insufficient for evaluating LLMs’ medical consultation abilities. To address this limitation, we developed an evaluation benchmark called Multi-turn Medical Dialogue Evaluation (MMD-Eval), specifically designed to evaluate the proactive communication and diagnostic capabilities of medical LLMs during consultations. Considering the high cost and potential for hallucinations in LLMs, we innovatively trained a task-oriented dialogue system to simulate patients engaging in dialogues with the medical LLMs using our structured medical records dataset. This approach enabled us to generate multi-turn dialogue data. Subsequently, we evaluate the communication skills and medical expertise of the medical LLMs. All resources associated with this study will be made publicly available.</abstract>
<identifier type="citekey">liu-etal-2025-interactive</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.325/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>4871</start>
<end>4896</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Interactive Evaluation for Medical LLMs via Task-oriented Dialogue System
%A Liu, Ruoyu
%A Xue, Kui
%A Zhang, Xiaofan
%A Zhang, Shaoting
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F liu-etal-2025-interactive
%X This study focuses on evaluating proactive communication and diagnostic capabilities of medical Large Language Models (LLMs), which directly impact their effectiveness in patient consultations. In typical medical scenarios, doctors often ask a set of questions to gain a comprehensive understanding of patients’ conditions. We argue that single-turn question-answering tasks such as MultiMedQA are insufficient for evaluating LLMs’ medical consultation abilities. To address this limitation, we developed an evaluation benchmark called Multi-turn Medical Dialogue Evaluation (MMD-Eval), specifically designed to evaluate the proactive communication and diagnostic capabilities of medical LLMs during consultations. Considering the high cost and potential for hallucinations in LLMs, we innovatively trained a task-oriented dialogue system to simulate patients engaging in dialogues with the medical LLMs using our structured medical records dataset. This approach enabled us to generate multi-turn dialogue data. Subsequently, we evaluate the communication skills and medical expertise of the medical LLMs. All resources associated with this study will be made publicly available.
%U https://aclanthology.org/2025.coling-main.325/
%P 4871-4896
Markdown (Informal)
[Interactive Evaluation for Medical LLMs via Task-oriented Dialogue System](https://aclanthology.org/2025.coling-main.325/) (Liu et al., COLING 2025)
ACL