@inproceedings{montesano-etal-2026-mostly,
title = "Mostly Grounded, Occasionally Risky: Expert Evaluation of {LLM}-Generated Supervisory Feedback in a Psychotherapy Training Simulator",
author = "Montesano, Adrian and
Bloomberg, Justin and
P{\'e}rez-Buriel, Marc",
editor = "Zirikly, Aya and
Bar, Kfir and
MacAvaney, Sean and
Ireland, Molly and
Ophir, Yaakov and
Atzil-Slonim, Dana and
Varadarajan, Vasudha and
Bedrick, Steven and
Desmet, Bart",
booktitle = "Proceedings of the 10th Workshop on Computational Linguistics and Clinical Psychology ({CLP}sych 2026)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.clpsych-1.24/",
pages = "298--305",
ISBN = "979-8-89176-421-7",
abstract = "Automated feedback is increasingly cited as a key advantage of AI-based psychotherapy training, yet the clinical groundedness of LLM-generated supervisory feedback remains unevaluated. We present an expert evaluation of supervisory feedback generated by PRACTICE, an LLM-powered open-ended psychotherapy training simulator, across 21 feedback instances from four novice trainees. Two clinical psychology experts independently coded 167 feedback propositions as Justified, Unjustified, or Unsure. Inter-rater reliability was near-perfect (raw agreement = 98.2{\%}; $\kappa$ = 0.902). Of the 167 propositions, 149 (89.2{\%}) were rated Justified; however, 52.4{\%} of feedback instances contained at least one non-justified proposition, and qualitative analysis identified three recurring failure types: incorrect characterization, referential imprecision, and unclear communication. In clinical training contexts, even low error rates carry ethical weight: unjustified feedback risks reinforcing inappropriate clinical behaviors in trainees that can be transferred to real practice. These findings provide an initial empirical basis for the responsible deployment of LLM-generated feedback in clinical training and call for traceable, expert-auditable feedback architectures."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="montesano-etal-2026-mostly">
<titleInfo>
<title>Mostly Grounded, Occasionally Risky: Expert Evaluation of LLM-Generated Supervisory Feedback in a Psychotherapy Training Simulator</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adrian</namePart>
<namePart type="family">Montesano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Justin</namePart>
<namePart type="family">Bloomberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Pérez-Buriel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Workshop on Computational Linguistics and Clinical Psychology (CLPsych 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aya</namePart>
<namePart type="family">Zirikly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kfir</namePart>
<namePart type="family">Bar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sean</namePart>
<namePart type="family">MacAvaney</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Molly</namePart>
<namePart type="family">Ireland</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaakov</namePart>
<namePart type="family">Ophir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dana</namePart>
<namePart type="family">Atzil-Slonim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vasudha</namePart>
<namePart type="family">Varadarajan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bedrick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bart</namePart>
<namePart type="family">Desmet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-421-7</identifier>
</relatedItem>
<abstract>Automated feedback is increasingly cited as a key advantage of AI-based psychotherapy training, yet the clinical groundedness of LLM-generated supervisory feedback remains unevaluated. We present an expert evaluation of supervisory feedback generated by PRACTICE, an LLM-powered open-ended psychotherapy training simulator, across 21 feedback instances from four novice trainees. Two clinical psychology experts independently coded 167 feedback propositions as Justified, Unjustified, or Unsure. Inter-rater reliability was near-perfect (raw agreement = 98.2%; κ = 0.902). Of the 167 propositions, 149 (89.2%) were rated Justified; however, 52.4% of feedback instances contained at least one non-justified proposition, and qualitative analysis identified three recurring failure types: incorrect characterization, referential imprecision, and unclear communication. In clinical training contexts, even low error rates carry ethical weight: unjustified feedback risks reinforcing inappropriate clinical behaviors in trainees that can be transferred to real practice. These findings provide an initial empirical basis for the responsible deployment of LLM-generated feedback in clinical training and call for traceable, expert-auditable feedback architectures.</abstract>
<identifier type="citekey">montesano-etal-2026-mostly</identifier>
<location>
<url>https://aclanthology.org/2026.clpsych-1.24/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>298</start>
<end>305</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mostly Grounded, Occasionally Risky: Expert Evaluation of LLM-Generated Supervisory Feedback in a Psychotherapy Training Simulator
%A Montesano, Adrian
%A Bloomberg, Justin
%A Pérez-Buriel, Marc
%Y Zirikly, Aya
%Y Bar, Kfir
%Y MacAvaney, Sean
%Y Ireland, Molly
%Y Ophir, Yaakov
%Y Atzil-Slonim, Dana
%Y Varadarajan, Vasudha
%Y Bedrick, Steven
%Y Desmet, Bart
%S Proceedings of the 10th Workshop on Computational Linguistics and Clinical Psychology (CLPsych 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-421-7
%F montesano-etal-2026-mostly
%X Automated feedback is increasingly cited as a key advantage of AI-based psychotherapy training, yet the clinical groundedness of LLM-generated supervisory feedback remains unevaluated. We present an expert evaluation of supervisory feedback generated by PRACTICE, an LLM-powered open-ended psychotherapy training simulator, across 21 feedback instances from four novice trainees. Two clinical psychology experts independently coded 167 feedback propositions as Justified, Unjustified, or Unsure. Inter-rater reliability was near-perfect (raw agreement = 98.2%; κ = 0.902). Of the 167 propositions, 149 (89.2%) were rated Justified; however, 52.4% of feedback instances contained at least one non-justified proposition, and qualitative analysis identified three recurring failure types: incorrect characterization, referential imprecision, and unclear communication. In clinical training contexts, even low error rates carry ethical weight: unjustified feedback risks reinforcing inappropriate clinical behaviors in trainees that can be transferred to real practice. These findings provide an initial empirical basis for the responsible deployment of LLM-generated feedback in clinical training and call for traceable, expert-auditable feedback architectures.
%U https://aclanthology.org/2026.clpsych-1.24/
%P 298-305
Markdown (Informal)
[Mostly Grounded, Occasionally Risky: Expert Evaluation of LLM-Generated Supervisory Feedback in a Psychotherapy Training Simulator](https://aclanthology.org/2026.clpsych-1.24/) (Montesano et al., CLPsych 2026)
ACL