% INLG 2024 paper, ACL Anthology ID 2024.inlg-main.19.
% {On} is braced so that sentence-casing styles keep the capital after the "?".
@inproceedings{lohn-etal-2024-machine-psychology,
  title     = {Is Machine Psychology here? {On} Requirements for Using Human Psychological Tests on Large Language Models},
  author    = {L{\"o}hn, Lea and
               Kiehne, Niklas and
               Ljapunov, Alexander and
               Balke, Wolf-Tilo},
  editor    = {Mahamood, Saad and
               Minh, Nguyen Le and
               Ippolito, Daphne},
  booktitle = {Proceedings of the 17th International Natural Language Generation Conference},
  month     = sep,
  year      = {2024},
  address   = {Tokyo, Japan},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.inlg-main.19},
  pages     = {230--242},
  abstract  = {In an effort to better understand the behavior of large language models (LLM), researchers recently turned to conducting psychological assessments on them. Several studies diagnose various psychological concepts in LLMs, such as psychopathological symptoms, personality traits, and intellectual functioning, aiming to unravel their black-box characteristics. But can we safely assess LLMs with tests that were originally designed for humans? The psychology domain looks back on decades of developing standards of appropriate testing procedures to ensure reliable and valid measures. We argue that analogous standardization processes are required for LLM assessments, given their differential functioning as compared to humans. In this paper, we propose seven requirements necessary for testing LLMs. Based on these, we critically reflect a sample of 25 recent machine psychology studies. Our analysis reveals (1) the lack of appropriate methods to assess test reliability and construct validity, (2) the unknown strength of construct-irrelevant influences, such as the contamination of pre-training corpora with test material, and (3) the pervasive issue of non-reproducibility of many studies. The results underscore the lack of a general methodology for the implementation of psychological assessments of LLMs and the need to redefine psychological constructs specifically for large language models rather than adopting them from human psychology.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey lohn-etal-2024-machine-psychology. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="lohn-etal-2024-machine-psychology">
    <titleInfo>
      <title>Is Machine Psychology here? On Requirements for Using Human Psychological Tests on Large Language Models</title>
    </titleInfo>
    <!-- Authors: one personal name element each, role "author". -->
    <name type="personal">
      <namePart type="given">Lea</namePart>
      <namePart type="family">Löhn</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Niklas</namePart>
      <namePart type="family">Kiehne</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alexander</namePart>
      <namePart type="family">Ljapunov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wolf-Tilo</namePart>
      <namePart type="family">Balke</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th International Natural Language Generation Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Saad</namePart>
        <namePart type="family">Mahamood</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nguyen</namePart>
        <namePart type="given">Le</namePart>
        <namePart type="family">Minh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daphne</namePart>
        <namePart type="family">Ippolito</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Tokyo, Japan</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In an effort to better understand the behavior of large language models (LLM), researchers recently turned to conducting psychological assessments on them. Several studies diagnose various psychological concepts in LLMs, such as psychopathological symptoms, personality traits, and intellectual functioning, aiming to unravel their black-box characteristics. But can we safely assess LLMs with tests that were originally designed for humans? The psychology domain looks back on decades of developing standards of appropriate testing procedures to ensure reliable and valid measures. We argue that analogous standardization processes are required for LLM assessments, given their differential functioning as compared to humans. In this paper, we propose seven requirements necessary for testing LLMs. Based on these, we critically reflect a sample of 25 recent machine psychology studies. Our analysis reveals (1) the lack of appropriate methods to assess test reliability and construct validity, (2) the unknown strength of construct-irrelevant influences, such as the contamination of pre-training corpora with test material, and (3) the pervasive issue of non-reproducibility of many studies. The results underscore the lack of a general methodology for the implementation of psychological assessments of LLMs and the need to redefine psychological constructs specifically for large language models rather than adopting them from human psychology.</abstract>
    <identifier type="citekey">lohn-etal-2024-machine-psychology</identifier>
    <location>
      <url>https://aclanthology.org/2024.inlg-main.19</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2024-09</date>
      <extent unit="page">
        <start>230</start>
        <end>242</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Is Machine Psychology here? On Requirements for Using Human Psychological Tests on Large Language Models
%A Löhn, Lea
%A Kiehne, Niklas
%A Ljapunov, Alexander
%A Balke, Wolf-Tilo
%Y Mahamood, Saad
%Y Minh, Nguyen Le
%Y Ippolito, Daphne
%S Proceedings of the 17th International Natural Language Generation Conference
%D 2024
%8 September
%I Association for Computational Linguistics
%C Tokyo, Japan
%F lohn-etal-2024-machine-psychology
%X In an effort to better understand the behavior of large language models (LLM), researchers recently turned to conducting psychological assessments on them. Several studies diagnose various psychological concepts in LLMs, such as psychopathological symptoms, personality traits, and intellectual functioning, aiming to unravel their black-box characteristics. But can we safely assess LLMs with tests that were originally designed for humans? The psychology domain looks back on decades of developing standards of appropriate testing procedures to ensure reliable and valid measures. We argue that analogous standardization processes are required for LLM assessments, given their differential functioning as compared to humans. In this paper, we propose seven requirements necessary for testing LLMs. Based on these, we critically reflect a sample of 25 recent machine psychology studies. Our analysis reveals (1) the lack of appropriate methods to assess test reliability and construct validity, (2) the unknown strength of construct-irrelevant influences, such as the contamination of pre-training corpora with test material, and (3) the pervasive issue of non-reproducibility of many studies. The results underscore the lack of a general methodology for the implementation of psychological assessments of LLMs and the need to redefine psychological constructs specifically for large language models rather than adopting them from human psychology.
%U https://aclanthology.org/2024.inlg-main.19
%P 230-242
Markdown (Informal)
[Is Machine Psychology here? On Requirements for Using Human Psychological Tests on Large Language Models](https://aclanthology.org/2024.inlg-main.19) (Löhn et al., INLG 2024)
ACL