BibTeX
@article{jones-etal-2024-comparing-humans,
title = "Comparing Humans and Large Language Models on an Experimental Protocol Inventory for Theory of Mind Evaluation ({EPITOME})",
author = "Jones, Cameron R. and
Trott, Sean and
Bergen, Benjamin",
journal = "Transactions of the Association for Computational Linguistics",
volume = "12",
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.tacl-1.45",
doi = "10.1162/tacl_a_00674",
pages = "803--819",
abstract = "We address a growing debate about the extent to which large language models (LLMs) produce behavior consistent with Theory of Mind (ToM) in humans. We present EPITOME: a battery of six experiments that tap diverse ToM capacities, including belief attribution, emotional inference, and pragmatic reasoning. We elicit a performance baseline from human participants for each task. We use the dataset to ask whether distributional linguistic information learned by LLMs is sufficient to explain ToM in humans. We compare performance of five LLMs to a baseline of responses from human comprehenders. Results are mixed. LLMs display considerable sensitivity to mental states and match human performance in several tasks. Yet, they commit systematic errors in others, especially those requiring pragmatic reasoning on the basis of mental state information. Such uneven performance indicates that human-level ToM may require resources beyond distributional information.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="jones-etal-2024-comparing-humans">
    <titleInfo>
      <title>Comparing Humans and Large Language Models on an Experimental Protocol Inventory for Theory of Mind Evaluation (EPITOME)</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Cameron</namePart>
      <namePart type="given">R</namePart>
      <namePart type="family">Jones</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sean</namePart>
      <namePart type="family">Trott</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Benjamin</namePart>
      <namePart type="family">Bergen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
      <titleInfo>
        <title>Transactions of the Association for Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <issuance>continuing</issuance>
        <publisher>MIT Press</publisher>
        <place>
          <placeTerm type="text">Cambridge, MA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">periodical</genre>
      <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>We address a growing debate about the extent to which large language models (LLMs) produce behavior consistent with Theory of Mind (ToM) in humans. We present EPITOME: a battery of six experiments that tap diverse ToM capacities, including belief attribution, emotional inference, and pragmatic reasoning. We elicit a performance baseline from human participants for each task. We use the dataset to ask whether distributional linguistic information learned by LLMs is sufficient to explain ToM in humans. We compare performance of five LLMs to a baseline of responses from human comprehenders. Results are mixed. LLMs display considerable sensitivity to mental states and match human performance in several tasks. Yet, they commit systematic errors in others, especially those requiring pragmatic reasoning on the basis of mental state information. Such uneven performance indicates that human-level ToM may require resources beyond distributional information.</abstract>
    <identifier type="citekey">jones-etal-2024-comparing-humans</identifier>
    <identifier type="doi">10.1162/tacl_a_00674</identifier>
    <location>
      <url>https://aclanthology.org/2024.tacl-1.45</url>
    </location>
    <part>
      <date>2024</date>
      <detail type="volume"><number>12</number></detail>
      <extent unit="page">
        <start>803</start>
        <end>819</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Journal Article
%T Comparing Humans and Large Language Models on an Experimental Protocol Inventory for Theory of Mind Evaluation (EPITOME)
%A Jones, Cameron R.
%A Trott, Sean
%A Bergen, Benjamin
%J Transactions of the Association for Computational Linguistics
%D 2024
%V 12
%I MIT Press
%C Cambridge, MA
%F jones-etal-2024-comparing-humans
%X We address a growing debate about the extent to which large language models (LLMs) produce behavior consistent with Theory of Mind (ToM) in humans. We present EPITOME: a battery of six experiments that tap diverse ToM capacities, including belief attribution, emotional inference, and pragmatic reasoning. We elicit a performance baseline from human participants for each task. We use the dataset to ask whether distributional linguistic information learned by LLMs is sufficient to explain ToM in humans. We compare performance of five LLMs to a baseline of responses from human comprehenders. Results are mixed. LLMs display considerable sensitivity to mental states and match human performance in several tasks. Yet, they commit systematic errors in others, especially those requiring pragmatic reasoning on the basis of mental state information. Such uneven performance indicates that human-level ToM may require resources beyond distributional information.
%R 10.1162/tacl_a_00674
%U https://aclanthology.org/2024.tacl-1.45
%U https://doi.org/10.1162/tacl_a_00674
%P 803-819
Markdown (Informal)
[Comparing Humans and Large Language Models on an Experimental Protocol Inventory for Theory of Mind Evaluation (EPITOME)](https://aclanthology.org/2024.tacl-1.45) (Jones et al., TACL 2024)
ACL
Cameron R. Jones, Sean Trott, and Benjamin Bergen. 2024. Comparing Humans and Large Language Models on an Experimental Protocol Inventory for Theory of Mind Evaluation (EPITOME). Transactions of the Association for Computational Linguistics, 12:803–819.