BibTeX
@inproceedings{marcinek-etal-2025-role,
    title = "Role of Reasoning in {LLM} Enjoyment Detection: Evaluation Across Conversational Levels for Human-Robot Interaction",
    author = "Marcinek, Lubos and
      Irfan, Bahar and
      Skantze, Gabriel and
      Pereira, Andre and
      Gustafsson, Joakim",
    editor = "B{\'e}chet, Fr{\'e}d{\'e}ric and
      Lef{\`e}vre, Fabrice and
      Asher, Nicholas and
      Kim, Seokhwan and
      Merlin, Teva",
    booktitle = "Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue",
    month = aug,
    year = "2025",
    address = "Avignon, France",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.sigdial-1.46/",
    pages = "573--590",
    abstract = "User enjoyment is central to developing conversational AI systems that can recover from failures and maintain interest over time. However, existing approaches often struggle to detect subtle cues that reflect user experience. Large Language Models (LLMs) with reasoning capabilities have outperformed standard models on various other tasks, suggesting potential benefits for enjoyment detection. This study investigates whether models with reasoning capabilities outperform standard models when assessing enjoyment in a human-robot dialogue corpus at both turn and interaction levels. Results indicate that reasoning capabilities have complex, model-dependent effects rather than universal benefits. While performance was nearly identical at the interaction level (0.44 vs 0.43), reasoning models substantially outperformed at the turn level (0.42 vs 0.36). Notably, LLMs correlated better with users' self-reported enjoyment metrics than human annotators, despite achieving lower accuracy against human consensus ratings. Analysis revealed distinctive error patterns: non-reasoning models showed bias toward positive ratings at the turn level, while both model types exhibited central tendency bias at the interaction level. These findings suggest that reasoning should be applied selectively based on model architecture and assessment context, with assessment granularity significantly influencing relative effectiveness."
}
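For anyone consuming this entry programmatically, here is a minimal sketch, assuming the third-party bibtexparser package (v1 API) and a hypothetical filename; it shows one common approach, not an official Anthology tool.

# Minimal sketch: read the BibTeX entry above with the third-party
# bibtexparser package (v1 API). The filename is a hypothetical assumption.
import bibtexparser

with open("marcinek-etal-2025-role.bib") as f:
    db = bibtexparser.load(f)  # BibDatabase holding a list of entry dicts

entry = db.entries[0]  # field keys are lowercased; ID/ENTRYTYPE are special keys
print(entry["ID"])     # citekey: marcinek-etal-2025-role
print(entry["title"])  # braces and LaTeX escapes (e.g. {\'e}) are kept raw
print(entry["pages"])  # 573--590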
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="marcinek-etal-2025-role">
    <titleInfo>
      <title>Role of Reasoning in LLM Enjoyment Detection: Evaluation Across Conversational Levels for Human-Robot Interaction</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Lubos</namePart>
      <namePart type="family">Marcinek</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bahar</namePart>
      <namePart type="family">Irfan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gabriel</namePart>
      <namePart type="family">Skantze</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Andre</namePart>
      <namePart type="family">Pereira</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Joakim</namePart>
      <namePart type="family">Gustafsson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Frédéric</namePart>
        <namePart type="family">Béchet</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fabrice</namePart>
        <namePart type="family">Lefèvre</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nicholas</namePart>
        <namePart type="family">Asher</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Seokhwan</namePart>
        <namePart type="family">Kim</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Teva</namePart>
        <namePart type="family">Merlin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Avignon, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>User enjoyment is central to developing conversational AI systems that can recover from failures and maintain interest over time. However, existing approaches often struggle to detect subtle cues that reflect user experience. Large Language Models (LLMs) with reasoning capabilities have outperformed standard models on various other tasks, suggesting potential benefits for enjoyment detection. This study investigates whether models with reasoning capabilities outperform standard models when assessing enjoyment in a human-robot dialogue corpus at both turn and interaction levels. Results indicate that reasoning capabilities have complex, model-dependent effects rather than universal benefits. While performance was nearly identical at the interaction level (0.44 vs 0.43), reasoning models substantially outperformed at the turn level (0.42 vs 0.36). Notably, LLMs correlated better with users’ self-reported enjoyment metrics than human annotators, despite achieving lower accuracy against human consensus ratings. Analysis revealed distinctive error patterns: non-reasoning models showed bias toward positive ratings at the turn level, while both model types exhibited central tendency bias at the interaction level. These findings suggest that reasoning should be applied selectively based on model architecture and assessment context, with assessment granularity significantly influencing relative effectiveness.</abstract>
    <identifier type="citekey">marcinek-etal-2025-role</identifier>
    <location>
      <url>https://aclanthology.org/2025.sigdial-1.46/</url>
    </location>
    <part>
      <date>2025-08</date>
      <extent unit="page">
        <start>573</start>
        <end>590</end>
      </extent>
    </part>
  </mods>
</modsCollection>
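The MODS record can be read with the Python standard library alone. A minimal sketch follows, assuming the XML above has been saved under a hypothetical filename; the namespace URI is taken from the modsCollection element itself.

# Minimal sketch: extract a few fields from the MODS record using only the
# Python standard library. The filename is a hypothetical assumption.
import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}  # namespace from <modsCollection>

root = ET.parse("marcinek-etal-2025-role.xml").getroot()
for mods in root.findall("m:mods", NS):
    title = mods.findtext("m:titleInfo/m:title", namespaces=NS)
    authors = [
        " ".join(p.text for p in name.findall("m:namePart", NS))
        for name in mods.findall("m:name", NS)  # direct children are the authors
        if name.findtext("m:role/m:roleTerm", namespaces=NS) == "author"
    ]
    start = mods.findtext("m:part/m:extent/m:start", namespaces=NS)
    end = mods.findtext("m:part/m:extent/m:end", namespaces=NS)
    print(title)
    print(", ".join(authors), f"(pp. {start}-{end})")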
Endnote
%0 Conference Proceedings
%T Role of Reasoning in LLM Enjoyment Detection: Evaluation Across Conversational Levels for Human-Robot Interaction
%A Marcinek, Lubos
%A Irfan, Bahar
%A Skantze, Gabriel
%A Pereira, Andre
%A Gustafsson, Joakim
%Y Béchet, Frédéric
%Y Lefèvre, Fabrice
%Y Asher, Nicholas
%Y Kim, Seokhwan
%Y Merlin, Teva
%S Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F marcinek-etal-2025-role
%X User enjoyment is central to developing conversational AI systems that can recover from failures and maintain interest over time. However, existing approaches often struggle to detect subtle cues that reflect user experience. Large Language Models (LLMs) with reasoning capabilities have outperformed standard models on various other tasks, suggesting potential benefits for enjoyment detection. This study investigates whether models with reasoning capabilities outperform standard models when assessing enjoyment in a human-robot dialogue corpus at both turn and interaction levels. Results indicate that reasoning capabilities have complex, model-dependent effects rather than universal benefits. While performance was nearly identical at the interaction level (0.44 vs 0.43), reasoning models substantially outperformed at the turn level (0.42 vs 0.36). Notably, LLMs correlated better with users’ self-reported enjoyment metrics than human annotators, despite achieving lower accuracy against human consensus ratings. Analysis revealed distinctive error patterns: non-reasoning models showed bias toward positive ratings at the turn level, while both model types exhibited central tendency bias at the interaction level. These findings suggest that reasoning should be applied selectively based on model architecture and assessment context, with assessment granularity significantly influencing relative effectiveness.
%U https://aclanthology.org/2025.sigdial-1.46/
%P 573-590
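The Endnote export is a flat format of %-tagged lines, where name tags such as %A (author) and %Y (editor) repeat in citation order. The helper below is a hypothetical sketch, not part of any standard tool, showing how such a record could be folded into a dict.

# Hypothetical sketch: parse a %-tagged Endnote record into a dict,
# collecting repeatable name tags (%A, %E, %Y) into ordered lists.
def parse_endnote(text: str) -> dict:
    record: dict = {}
    for line in text.splitlines():
        if len(line) < 4 or not line.startswith("%"):
            continue                   # skip blanks and non-tag lines
        tag, value = line[:2], line[3:].strip()
        if tag in ("%A", "%E", "%Y"):  # repeatable name tags, order preserved
            record.setdefault(tag, []).append(value)
        else:
            record[tag] = value
    return record

# e.g. parse_endnote(open("marcinek-etal-2025-role.enw").read())["%T"]
# returns the paper title; "%A" returns the five authors in citation order.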
Markdown (Informal)
[Role of Reasoning in LLM Enjoyment Detection: Evaluation Across Conversational Levels for Human-Robot Interaction](https://aclanthology.org/2025.sigdial-1.46/) (Marcinek et al., SIGDIAL 2025)
ACL
Lubos Marcinek, Bahar Irfan, Gabriel Skantze, Andre Pereira, and Joakim Gustafsson. 2025. Role of Reasoning in LLM Enjoyment Detection: Evaluation Across Conversational Levels for Human-Robot Interaction. In Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue, pages 573–590, Avignon, France. Association for Computational Linguistics.