@inproceedings{harel-canada-etal-2024-measuring,
title = "Measuring Psychological Depth in Language Models",
author = "Harel-Canada, Fabrice Y and
Zhou, Hanyu and
Muppalla, Sreya and
Yildiz, Zeynep Senahan and
Kim, Miryung and
Sahai, Amit and
Peng, Nanyun",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.953/",
doi = "10.18653/v1/2024.emnlp-main.953",
pages = "17162--17196",
abstract = "Evaluations of creative stories generated by large language models (LLMs) often focus on objective properties of the text, such as its style, coherence, and diversity. While these metrics are indispensable, they do not speak to a story{'}s subjective, psychological impact from a reader{'}s perspective. We introduce the Psychological Depth Scale (PDS), a novel framework rooted in literary theory that measures an LLM{'}s ability to produce authentic and narratively complex stories that provoke emotion, empathy, and engagement. We empirically validate our framework by showing that humans can consistently evaluate stories based on PDS (0.72 Krippendorff{'}s alpha). We also explore techniques for automating the PDS to easily scale future analyses. GPT-4o, combined with a novel Mixture-of-Personas (MoP) prompting strategy, achieves an average Spearman correlation of 0.51 with human judgment while Llama-3-70B with constrained decoding scores as high as 0.68 for empathy. Finally, we compared the depth of stories authored by both humans and LLMs. Surprisingly, GPT-4 stories either surpassed or were statistically indistinguishable from highly-rated human-written stories sourced from Reddit. By shifting the focus from text to reader, the Psychological Depth Scale is a validated, automated, and systematic means of measuring the capacity of LLMs to connect with humans through the stories they tell."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="harel-canada-etal-2024-measuring">
<titleInfo>
<title>Measuring Psychological Depth in Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fabrice</namePart>
<namePart type="given">Y</namePart>
<namePart type="family">Harel-Canada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanyu</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sreya</namePart>
<namePart type="family">Muppalla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeynep</namePart>
<namePart type="given">Senahan</namePart>
<namePart type="family">Yildiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miryung</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amit</namePart>
<namePart type="family">Sahai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nanyun</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluations of creative stories generated by large language models (LLMs) often focus on objective properties of the text, such as its style, coherence, and diversity. While these metrics are indispensable, they do not speak to a story’s subjective, psychological impact from a reader’s perspective. We introduce the Psychological Depth Scale (PDS), a novel framework rooted in literary theory that measures an LLM’s ability to produce authentic and narratively complex stories that provoke emotion, empathy, and engagement. We empirically validate our framework by showing that humans can consistently evaluate stories based on PDS (0.72 Krippendorff’s alpha). We also explore techniques for automating the PDS to easily scale future analyses. GPT-4o, combined with a novel Mixture-of-Personas (MoP) prompting strategy, achieves an average Spearman correlation of 0.51 with human judgment while Llama-3-70B with constrained decoding scores as high as 0.68 for empathy. Finally, we compared the depth of stories authored by both humans and LLMs. Surprisingly, GPT-4 stories either surpassed or were statistically indistinguishable from highly-rated human-written stories sourced from Reddit. By shifting the focus from text to reader, the Psychological Depth Scale is a validated, automated, and systematic means of measuring the capacity of LLMs to connect with humans through the stories they tell.</abstract>
<identifier type="citekey">harel-canada-etal-2024-measuring</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.953</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.953/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>17162</start>
<end>17196</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Measuring Psychological Depth in Language Models
%A Harel-Canada, Fabrice Y.
%A Zhou, Hanyu
%A Muppalla, Sreya
%A Yildiz, Zeynep Senahan
%A Kim, Miryung
%A Sahai, Amit
%A Peng, Nanyun
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F harel-canada-etal-2024-measuring
%X Evaluations of creative stories generated by large language models (LLMs) often focus on objective properties of the text, such as its style, coherence, and diversity. While these metrics are indispensable, they do not speak to a story’s subjective, psychological impact from a reader’s perspective. We introduce the Psychological Depth Scale (PDS), a novel framework rooted in literary theory that measures an LLM’s ability to produce authentic and narratively complex stories that provoke emotion, empathy, and engagement. We empirically validate our framework by showing that humans can consistently evaluate stories based on PDS (0.72 Krippendorff’s alpha). We also explore techniques for automating the PDS to easily scale future analyses. GPT-4o, combined with a novel Mixture-of-Personas (MoP) prompting strategy, achieves an average Spearman correlation of 0.51 with human judgment while Llama-3-70B with constrained decoding scores as high as 0.68 for empathy. Finally, we compared the depth of stories authored by both humans and LLMs. Surprisingly, GPT-4 stories either surpassed or were statistically indistinguishable from highly-rated human-written stories sourced from Reddit. By shifting the focus from text to reader, the Psychological Depth Scale is a validated, automated, and systematic means of measuring the capacity of LLMs to connect with humans through the stories they tell.
%R 10.18653/v1/2024.emnlp-main.953
%U https://aclanthology.org/2024.emnlp-main.953/
%U https://doi.org/10.18653/v1/2024.emnlp-main.953
%P 17162-17196
Markdown (Informal)
[Measuring Psychological Depth in Language Models](https://aclanthology.org/2024.emnlp-main.953/) (Harel-Canada et al., EMNLP 2024)
ACL
- Fabrice Y Harel-Canada, Hanyu Zhou, Sreya Muppalla, Zeynep Senahan Yildiz, Miryung Kim, Amit Sahai, and Nanyun Peng. 2024. Measuring Psychological Depth in Language Models. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 17162–17196, Miami, Florida, USA. Association for Computational Linguistics.