@article{chhun-etal-2024-language,
title = "Do Language Models Enjoy Their Own Stories? Prompting Large Language Models for Automatic Story Evaluation",
author = "Chhun, Cyril and
Suchanek, Fabian M. and
Clavel, Chlo{\'e}",
journal = "Transactions of the Association for Computational Linguistics",
volume = "12",
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.tacl-1.62/",
doi = "10.1162/tacl_a_00689",
pages = "1122--1142",
abstract = "Storytelling is an integral part of human experience and plays a crucial role in social interactions. Thus, Automatic Story Evaluation (ASE) and Generation (ASG) could benefit society in multiple ways, but they are challenging tasks which require high-level human abilities such as creativity, reasoning, and deep understanding. Meanwhile, Large Language Models (LLMs) now achieve state-of-the-art performance on many NLP tasks. In this paper, we study whether LLMs can be used as substitutes for human annotators for ASE. We perform an extensive analysis of the correlations between LLM ratings, other automatic measures, and human annotations, and we explore the influence of prompting on the results and the explainability of LLM behaviour. Most notably, we find that LLMs outperform current automatic measures for system-level evaluation but still struggle at providing satisfactory explanations for their answers."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chhun-etal-2024-language">
<titleInfo>
<title>Do Language Models Enjoy Their Own Stories? Prompting Large Language Models for Automatic Story Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cyril</namePart>
<namePart type="family">Chhun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabian</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Suchanek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chloé</namePart>
<namePart type="family">Clavel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Storytelling is an integral part of human experience and plays a crucial role in social interactions. Thus, Automatic Story Evaluation (ASE) and Generation (ASG) could benefit society in multiple ways, but they are challenging tasks which require high-level human abilities such as creativity, reasoning, and deep understanding. Meanwhile, Large Language Models (LLMs) now achieve state-of-the-art performance on many NLP tasks. In this paper, we study whether LLMs can be used as substitutes for human annotators for ASE. We perform an extensive analysis of the correlations between LLM ratings, other automatic measures, and human annotations, and we explore the influence of prompting on the results and the explainability of LLM behaviour. Most notably, we find that LLMs outperform current automatic measures for system-level evaluation but still struggle at providing satisfactory explanations for their answers.</abstract>
<identifier type="citekey">chhun-etal-2024-language</identifier>
<identifier type="doi">10.1162/tacl_a_00689</identifier>
<location>
<url>https://aclanthology.org/2024.tacl-1.62/</url>
</location>
<part>
<date>2024</date>
<detail type="volume"><number>12</number></detail>
<extent unit="page">
<start>1122</start>
<end>1142</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Do Language Models Enjoy Their Own Stories? Prompting Large Language Models for Automatic Story Evaluation
%A Chhun, Cyril
%A Suchanek, Fabian M.
%A Clavel, Chloé
%J Transactions of the Association for Computational Linguistics
%D 2024
%V 12
%I MIT Press
%C Cambridge, MA
%F chhun-etal-2024-language
%X Storytelling is an integral part of human experience and plays a crucial role in social interactions. Thus, Automatic Story Evaluation (ASE) and Generation (ASG) could benefit society in multiple ways, but they are challenging tasks which require high-level human abilities such as creativity, reasoning, and deep understanding. Meanwhile, Large Language Models (LLMs) now achieve state-of-the-art performance on many NLP tasks. In this paper, we study whether LLMs can be used as substitutes for human annotators for ASE. We perform an extensive analysis of the correlations between LLM ratings, other automatic measures, and human annotations, and we explore the influence of prompting on the results and the explainability of LLM behaviour. Most notably, we find that LLMs outperform current automatic measures for system-level evaluation but still struggle at providing satisfactory explanations for their answers.
%R 10.1162/tacl_a_00689
%U https://aclanthology.org/2024.tacl-1.62/
%U https://doi.org/10.1162/tacl_a_00689
%P 1122-1142
Markdown (Informal)
[Do Language Models Enjoy Their Own Stories? Prompting Large Language Models for Automatic Story Evaluation](https://aclanthology.org/2024.tacl-1.62/) (Chhun et al., TACL 2024)
ACL