@inproceedings{schiott-etal-2025-using,
title = "Using {LLM}s to Grade Clinical Reasoning for Medical Students in Virtual Patient Dialogues",
author = {Schi{\"o}tt, Jonathan and
Ivegren, William and
Borg, Alexander and
Parodis, Ioannis and
Skantze, Gabriel},
editor = "B{\'e}chet, Fr{\'e}d{\'e}ric and
Lef{\`e}vre, Fabrice and
Asher, Nicholas and
Kim, Seokhwan and
Merlin, Teva",
booktitle = "Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue",
month = aug,
year = "2025",
address = "Avignon, France",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sigdial-1.56/",
pages = "750--763",
abstract = "This paper presents an evaluation of the use of large language models (LLMs) for grading clinical reasoning during rheumatology medical history virtual patient (VP) simulations. The study explores the feasibility of using state-of-the-art LLMs, including both general-purpose models, with various prompting strategies such as zero-shot, analysis-first, and chain-of-thought prompting, as well as reasoning models. The performance of these models in grading transcribed dialogues from VP simulations conducted on a Furhat robot was evaluated against human expert annotations. Human experts initially achieved a 65{\%} inter-rater agreement, which resulted in a pooled Cohen{'}s Kappa of 0.71 and 82.3{\%} correctness. The best LLM, o3-mini, achieved a pooled Kappa of 0.68 and 81.5{\%} correctness, with response times under 30 seconds, compared to approximately 6 minutes for human grading. These results indicate the possibility that automatic assessments can approach human reliability under controlled simulation conditions while delivering time and cost efficiencies."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schiott-etal-2025-using">
<titleInfo>
<title>Using LLMs to Grade Clinical Reasoning for Medical Students in Virtual Patient Dialogues</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Schiött</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="family">Ivegren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Borg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ioannis</namePart>
<namePart type="family">Parodis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Skantze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabrice</namePart>
<namePart type="family">Lefèvre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Asher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Teva</namePart>
<namePart type="family">Merlin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Avignon, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents an evaluation of the use of large language models (LLMs) for grading clinical reasoning during rheumatology medical history virtual patient (VP) simulations. The study explores the feasibility of using state-of-the-art LLMs, including both general-purpose models, with various prompting strategies such as zero-shot, analysis-first, and chain-of-thought prompting, as well as reasoning models. The performance of these models in grading transcribed dialogues from VP simulations conducted on a Furhat robot was evaluated against human expert annotations. Human experts initially achieved a 65% inter-rater agreement, which resulted in a pooled Cohen’s Kappa of 0.71 and 82.3% correctness. The best LLM, o3-mini, achieved a pooled Kappa of 0.68 and 81.5% correctness, with response times under 30 seconds, compared to approximately 6 minutes for human grading. These results indicate the possibility that automatic assessments can approach human reliability under controlled simulation conditions while delivering time and cost efficiencies.</abstract>
<identifier type="citekey">schiott-etal-2025-using</identifier>
<location>
<url>https://aclanthology.org/2025.sigdial-1.56/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>750</start>
<end>763</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Using LLMs to Grade Clinical Reasoning for Medical Students in Virtual Patient Dialogues
%A Schiött, Jonathan
%A Ivegren, William
%A Borg, Alexander
%A Parodis, Ioannis
%A Skantze, Gabriel
%Y Béchet, Frédéric
%Y Lefèvre, Fabrice
%Y Asher, Nicholas
%Y Kim, Seokhwan
%Y Merlin, Teva
%S Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F schiott-etal-2025-using
%X This paper presents an evaluation of the use of large language models (LLMs) for grading clinical reasoning during rheumatology medical history virtual patient (VP) simulations. The study explores the feasibility of using state-of-the-art LLMs, including both general-purpose models, with various prompting strategies such as zero-shot, analysis-first, and chain-of-thought prompting, as well as reasoning models. The performance of these models in grading transcribed dialogues from VP simulations conducted on a Furhat robot was evaluated against human expert annotations. Human experts initially achieved a 65% inter-rater agreement, which resulted in a pooled Cohen’s Kappa of 0.71 and 82.3% correctness. The best LLM, o3-mini, achieved a pooled Kappa of 0.68 and 81.5% correctness, with response times under 30 seconds, compared to approximately 6 minutes for human grading. These results indicate the possibility that automatic assessments can approach human reliability under controlled simulation conditions while delivering time and cost efficiencies.
%U https://aclanthology.org/2025.sigdial-1.56/
%P 750-763
Markdown (Informal)
[Using LLMs to Grade Clinical Reasoning for Medical Students in Virtual Patient Dialogues](https://aclanthology.org/2025.sigdial-1.56/) (Schiött et al., SIGDIAL 2025)
ACL