@inproceedings{akoury-etal-2023-framework,
title = "A Framework for Exploring Player Perceptions of {LLM}-Generated Dialogue in Commercial Video Games",
author = "Akoury, Nader and
Yang, Qian and
Iyyer, Mohit",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.151",
doi = "10.18653/v1/2023.findings-emnlp.151",
pages = "2295--2311",
abstract = "The growing capabilities of large language models (LLMs) have inspired recent efforts to integrate LLM-generated dialogue into video games. However, evaluation remains a major challenge: how do we assess the player experience in a commercial game augmented with LLM-generated dialogue? To explore this question, we introduce a dynamic evaluation framework for the dialogue management systems that govern the task-oriented dialogue often found in role-playing video games. We first extract dialogue from the widely-acclaimed role-playing game *Disco Elysium: The Final Cut*, which contains 1.1M words of dialogue spread across a complex graph of utterances where node reachability depends on game state (e.g., whether a certain item is held). Using this dataset, we have GPT-4 perform *dialogue infilling* to generate grounded utterances based on game state represented via code. In a statistically robust study of 28 players recruited from the r/DiscoElysium subreddit, the LLM outputs are evaluated against the game designers{'} writing via both preference judgments and free-form feedback using a web interface that recreates the game{'}s core conversation functionality. Overall, the game designers{'} prose is significantly preferred to GPT-4 generations, with participants citing reasons such as improved logical flow and grounding with the game state. To spur more principled future research in this area, we release our web interface and tools to enable researchers to build upon our work. https://pl.aiwright.dev",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="akoury-etal-2023-framework">
<titleInfo>
<title>A Framework for Exploring Player Perceptions of LLM-Generated Dialogue in Commercial Video Games</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nader</namePart>
<namePart type="family">Akoury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qian</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Iyyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The growing capabilities of large language models (LLMs) have inspired recent efforts to integrate LLM-generated dialogue into video games. However, evaluation remains a major challenge: how do we assess the player experience in a commercial game augmented with LLM-generated dialogue? To explore this question, we introduce a dynamic evaluation framework for the dialogue management systems that govern the task-oriented dialogue often found in role-playing video games. We first extract dialogue from the widely-acclaimed role-playing game *Disco Elysium: The Final Cut*, which contains 1.1M words of dialogue spread across a complex graph of utterances where node reachability depends on game state (e.g., whether a certain item is held). Using this dataset, we have GPT-4 perform *dialogue infilling* to generate grounded utterances based on game state represented via code. In a statistically robust study of 28 players recruited from the r/DiscoElysium subreddit, the LLM outputs are evaluated against the game designers’ writing via both preference judgments and free-form feedback using a web interface that recreates the game’s core conversation functionality. Overall, the game designers’ prose is significantly preferred to GPT-4 generations, with participants citing reasons such as improved logical flow and grounding with the game state. To spur more principled future research in this area, we release our web interface and tools to enable researchers to build upon our work. https://pl.aiwright.dev</abstract>
<identifier type="citekey">akoury-etal-2023-framework</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.151</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.151</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>2295</start>
<end>2311</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Framework for Exploring Player Perceptions of LLM-Generated Dialogue in Commercial Video Games
%A Akoury, Nader
%A Yang, Qian
%A Iyyer, Mohit
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F akoury-etal-2023-framework
%X The growing capabilities of large language models (LLMs) have inspired recent efforts to integrate LLM-generated dialogue into video games. However, evaluation remains a major challenge: how do we assess the player experience in a commercial game augmented with LLM-generated dialogue? To explore this question, we introduce a dynamic evaluation framework for the dialogue management systems that govern the task-oriented dialogue often found in role-playing video games. We first extract dialogue from the widely-acclaimed role-playing game *Disco Elysium: The Final Cut*, which contains 1.1M words of dialogue spread across a complex graph of utterances where node reachability depends on game state (e.g., whether a certain item is held). Using this dataset, we have GPT-4 perform *dialogue infilling* to generate grounded utterances based on game state represented via code. In a statistically robust study of 28 players recruited from the r/DiscoElysium subreddit, the LLM outputs are evaluated against the game designers’ writing via both preference judgments and free-form feedback using a web interface that recreates the game’s core conversation functionality. Overall, the game designers’ prose is significantly preferred to GPT-4 generations, with participants citing reasons such as improved logical flow and grounding with the game state. To spur more principled future research in this area, we release our web interface and tools to enable researchers to build upon our work. https://pl.aiwright.dev
%R 10.18653/v1/2023.findings-emnlp.151
%U https://aclanthology.org/2023.findings-emnlp.151
%U https://doi.org/10.18653/v1/2023.findings-emnlp.151
%P 2295-2311
Markdown (Informal)
[A Framework for Exploring Player Perceptions of LLM-Generated Dialogue in Commercial Video Games](https://aclanthology.org/2023.findings-emnlp.151) (Akoury et al., Findings 2023)
ACL