@inproceedings{emerson-etal-2026-conversational,
title = "Conversational {AI} for Virtual Standardized Patients using a Speech-to-Speech {LLM}",
author = "Emerson, Andrew and
Evanini, Keelan and
Somay, Su and
Frome, Kevin and
Ha, Le An and
Harik, Polina",
editor = "Riccardi, Giuseppe and
Mousavi, Seyed Mahed and
Torres, Maria Ines and
Yoshino, Koichiro and
Callejas, Zoraida and
Chowdhury, Shammur Absar and
Chen, Yun-Nung and
Bechet, Frederic and
Gustafson, Joakim and
Damnati, G{\'e}raldine and
Papangelis, Alex and
D{'}Haro, Luis Fernando and
Mendon{\c{c}}a, John and
Bernardi, Raffaella and
Hakkani-Tur, Dilek and
Di Fabbrizio, Giuseppe {''}Pino{''} and
Kawahara, Tatsuya and
Alam, Firoj and
Tur, Gokhan and
Johnston, Michael",
booktitle = "Proceedings of the 16th International Workshop on Spoken Dialogue System Technology",
month = feb,
year = "2026",
address = "Trento, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.iwsds-1.16/",
pages = "142--152",
abstract = "To develop clinical reasoning skills, medical students are often tasked with interacting with trained standardized patients ({SP}s). Human {SP}s enable real conversations that can resemble authentic clinical scenarios. However, human {SP}s require extensive training and are often limited in their accessibility and continual availability to medical students or residents. Virtual {SP}s offer the ability for medical students to practice clinical interviews in a lower-stakes setting across a broader set of clinical cases. This paper introduces a virtual {SP} ({VSP}) that leverages {A}mazon{'}s Nova Sonic, a speech-to-speech foundation model designed for human-like conversation. We investigated the ability of Nova Sonic to portray four distinct clinical cases in virtual doctor-patient encounters with 20 third-year medical students. The system{'}s realism, its perceived learning value, and user experience were all assessed via a survey administered to the students. Students were also asked to compare this experience to interactions with a human {SP}. Survey results and conversations were analyzed to derive insights for improving the Nova Sonic-based {VSP} system."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="emerson-etal-2026-conversational">
<titleInfo>
<title>Conversational AI for Virtual Standardized Patients using a Speech-to-Speech LLM</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Emerson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keelan</namePart>
<namePart type="family">Evanini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Su</namePart>
<namePart type="family">Somay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Frome</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Le</namePart>
<namePart type="given">An</namePart>
<namePart type="family">Ha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Polina</namePart>
<namePart type="family">Harik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-02</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Workshop on Spoken Dialogue System Technology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Riccardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seyed</namePart>
<namePart type="given">Mahed</namePart>
<namePart type="family">Mousavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Ines</namePart>
<namePart type="family">Torres</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichiro</namePart>
<namePart type="family">Yoshino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zoraida</namePart>
<namePart type="family">Callejas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="given">Absar</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Bechet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Gustafson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Géraldine</namePart>
<namePart type="family">Damnati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Papangelis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">Fernando</namePart>
<namePart type="family">D’Haro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Mendonça</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raffaella</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="given">”Pino”</namePart>
<namePart type="family">Di Fabbrizio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gokhan</namePart>
<namePart type="family">Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Johnston</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Trento, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>To develop clinical reasoning skills, medical students are often tasked with interacting with trained standardized patients (SPs). Human SPs enable real conversations that can resemble authentic clinical scenarios. However, human SPs require extensive training and are often limited in their accessibility and continual availability to medical students or residents. Virtual SPs offer the ability for medical students to practice clinical interviews in a lower-stakes setting across a broader set of clinical cases. This paper introduces a virtual SP (VSP) that leverages Amazon’s Nova Sonic, a speech-to-speech foundation model designed for human-like conversation. We investigated the ability of Nova Sonic to portray four distinct clinical cases in virtual doctor-patient encounters with 20 third-year medical students. The system’s realism, its perceived learning value, and user experience were all assessed via a survey administered to the students. Students were also asked to compare this experience to interactions with a human SP. Survey results and conversations were analyzed to derive insights for improving the Nova Sonic-based VSP system.</abstract>
<identifier type="citekey">emerson-etal-2026-conversational</identifier>
<location>
<url>https://aclanthology.org/2026.iwsds-1.16/</url>
</location>
<part>
<date>2026-02</date>
<extent unit="page">
<start>142</start>
<end>152</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Conversational AI for Virtual Standardized Patients using a Speech-to-Speech LLM
%A Emerson, Andrew
%A Evanini, Keelan
%A Somay, Su
%A Frome, Kevin
%A Ha, Le An
%A Harik, Polina
%Y Riccardi, Giuseppe
%Y Mousavi, Seyed Mahed
%Y Torres, Maria Ines
%Y Yoshino, Koichiro
%Y Callejas, Zoraida
%Y Chowdhury, Shammur Absar
%Y Chen, Yun-Nung
%Y Bechet, Frederic
%Y Gustafson, Joakim
%Y Damnati, Géraldine
%Y Papangelis, Alex
%Y D’Haro, Luis Fernando
%Y Mendonça, John
%Y Bernardi, Raffaella
%Y Hakkani-Tur, Dilek
%Y Di Fabbrizio, Giuseppe ”Pino”
%Y Kawahara, Tatsuya
%Y Alam, Firoj
%Y Tur, Gokhan
%Y Johnston, Michael
%S Proceedings of the 16th International Workshop on Spoken Dialogue System Technology
%D 2026
%8 February
%I Association for Computational Linguistics
%C Trento, Italy
%F emerson-etal-2026-conversational
%X To develop clinical reasoning skills, medical students are often tasked with interacting with trained standardized patients (SPs). Human SPs enable real conversations that can resemble authentic clinical scenarios. However, human SPs require extensive training and are often limited in their accessibility and continual availability to medical students or residents. Virtual SPs offer the ability for medical students to practice clinical interviews in a lower-stakes setting across a broader set of clinical cases. This paper introduces a virtual SP (VSP) that leverages Amazon’s Nova Sonic, a speech-to-speech foundation model designed for human-like conversation. We investigated the ability of Nova Sonic to portray four distinct clinical cases in virtual doctor-patient encounters with 20 third-year medical students. The system’s realism, its perceived learning value, and user experience were all assessed via a survey administered to the students. Students were also asked to compare this experience to interactions with a human SP. Survey results and conversations were analyzed to derive insights for improving the Nova Sonic-based VSP system.
%U https://aclanthology.org/2026.iwsds-1.16/
%P 142-152
Markdown (Informal)
[Conversational AI for Virtual Standardized Patients using a Speech-to-Speech LLM](https://aclanthology.org/2026.iwsds-1.16/) (Emerson et al., IWSDS 2026)
ACL