@inproceedings{wang-etal-2025-automatic,
title = "Automatic Scoring of an Open-Response Measure of Advanced Mind-Reading Using Large Language Models",
author = "Wang, Yixiao and
Dsouza, Russel and
Lee, Robert and
Apperly, Ian and
Devine, Rory and
van der Kleij, Sanne and
Lee, Mark",
editor = "Zirikly, Ayah and
Yates, Andrew and
Desmet, Bart and
Ireland, Molly and
Bedrick, Steven and
MacAvaney, Sean and
Bar, Kfir and
Ophir, Yaakov",
booktitle = "Proceedings of the 10th Workshop on Computational Linguistics and Clinical Psychology (CLPsych 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.clpsych-1.7/",
doi = "10.18653/v1/2025.clpsych-1.7",
pages = "79--89",
ISBN = "979-8-89176-226-8",
abstract = "A rigorous psychometric approach is crucial for the accurate measurement of mind-reading abilities. Traditional scoring methods for such tests, which involve lengthy free-text responses, require considerable time and human effort. This study investigates the use of large language models (LLMs) to automate the scoring of psychometric tests. Data were collected from participants aged 13 to 30 years and scored by trained human coders to establish a benchmark. We evaluated multiple LLMs against human assessments, exploring various prompting strategies to optimize performance and fine-tuning the models using a subset of the collected data to enhance accuracy. Our results demonstrate that LLMs can assess advanced mind-reading abilities with over 90{\%} accuracy on average. Notably, in most test items, the LLMs achieved higher Kappa agreement with the lead coder than two trained human coders, highlighting their potential to reliably score open-response psychometric tests."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2025-automatic">
<titleInfo>
<title>Automatic Scoring of an Open-Response Measure of Advanced Mind-Reading Using Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yixiao</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Russel</namePart>
<namePart type="family">Dsouza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ian</namePart>
<namePart type="family">Apperly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rory</namePart>
<namePart type="family">Devine</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sanne</namePart>
<namePart type="family">van der Kleij</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Workshop on Computational Linguistics and Clinical Psychology (CLPsych 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ayah</namePart>
<namePart type="family">Zirikly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Yates</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bart</namePart>
<namePart type="family">Desmet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Molly</namePart>
<namePart type="family">Ireland</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bedrick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sean</namePart>
<namePart type="family">MacAvaney</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kfir</namePart>
<namePart type="family">Bar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaakov</namePart>
<namePart type="family">Ophir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-226-8</identifier>
</relatedItem>
<abstract>A rigorous psychometric approach is crucial for the accurate measurement of mind-reading abilities. Traditional scoring methods for such tests, which involve lengthy free-text responses, require considerable time and human effort. This study investigates the use of large language models (LLMs) to automate the scoring of psychometric tests. Data were collected from participants aged 13 to 30 years and scored by trained human coders to establish a benchmark. We evaluated multiple LLMs against human assessments, exploring various prompting strategies to optimize performance and fine-tuning the models using a subset of the collected data to enhance accuracy. Our results demonstrate that LLMs can assess advanced mind-reading abilities with over 90% accuracy on average. Notably, in most test items, the LLMs achieved higher Kappa agreement with the lead coder than two trained human coders, highlighting their potential to reliably score open-response psychometric tests.</abstract>
<identifier type="citekey">wang-etal-2025-automatic</identifier>
<identifier type="doi">10.18653/v1/2025.clpsych-1.7</identifier>
<location>
<url>https://aclanthology.org/2025.clpsych-1.7/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>79</start>
<end>89</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Scoring of an Open-Response Measure of Advanced Mind-Reading Using Large Language Models
%A Wang, Yixiao
%A Dsouza, Russel
%A Lee, Robert
%A Apperly, Ian
%A Devine, Rory
%A van der Kleij, Sanne
%A Lee, Mark
%Y Zirikly, Ayah
%Y Yates, Andrew
%Y Desmet, Bart
%Y Ireland, Molly
%Y Bedrick, Steven
%Y MacAvaney, Sean
%Y Bar, Kfir
%Y Ophir, Yaakov
%S Proceedings of the 10th Workshop on Computational Linguistics and Clinical Psychology (CLPsych 2025)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-226-8
%F wang-etal-2025-automatic
%X A rigorous psychometric approach is crucial for the accurate measurement of mind-reading abilities. Traditional scoring methods for such tests, which involve lengthy free-text responses, require considerable time and human effort. This study investigates the use of large language models (LLMs) to automate the scoring of psychometric tests. Data were collected from participants aged 13 to 30 years and scored by trained human coders to establish a benchmark. We evaluated multiple LLMs against human assessments, exploring various prompting strategies to optimize performance and fine-tuning the models using a subset of the collected data to enhance accuracy. Our results demonstrate that LLMs can assess advanced mind-reading abilities with over 90% accuracy on average. Notably, in most test items, the LLMs achieved higher Kappa agreement with the lead coder than two trained human coders, highlighting their potential to reliably score open-response psychometric tests.
%R 10.18653/v1/2025.clpsych-1.7
%U https://aclanthology.org/2025.clpsych-1.7/
%U https://doi.org/10.18653/v1/2025.clpsych-1.7
%P 79-89
Markdown (Informal)
[Automatic Scoring of an Open-Response Measure of Advanced Mind-Reading Using Large Language Models](https://aclanthology.org/2025.clpsych-1.7/) (Wang et al., CLPsych 2025)
ACL
Yixiao Wang, Russel Dsouza, Robert Lee, Ian Apperly, Rory Devine, Sanne van der Kleij, and Mark Lee. 2025. Automatic Scoring of an Open-Response Measure of Advanced Mind-Reading Using Large Language Models. In Proceedings of the 10th Workshop on Computational Linguistics and Clinical Psychology (CLPsych 2025), pages 79–89, Albuquerque, New Mexico. Association for Computational Linguistics.