@inproceedings{nakaguro-yoshino-2026-exploring,
title = "Exploring Emotional Nuances in Spoken Dialogue: Dataset Construction and Prediction of Emotional Dialogue Breakdown",
author = "Nakaguro, Hyuga and
Yoshino, Koichiro",
editor = "Riccardi, Giuseppe and
Mousavi, Seyed Mahed and
Torres, Maria Ines and
Yoshino, Koichiro and
Callejas, Zoraida and
Chowdhury, Shammur Absar and
Chen, Yun-Nung and
Bechet, Frederic and
Gustafson, Joakim and
Damnati, G{\'e}raldine and
Papangelis, Alex and
D{'}Haro, Luis Fernando and
Mendon{\c{c}}a, John and
Bernardi, Raffaella and
Hakkani-Tur, Dilek and
Di Fabbrizio, Giuseppe {``}Pino{''} and
Kawahara, Tatsuya and
Alam, Firoj and
Tur, Gokhan and
Johnston, Michael",
booktitle = "Proceedings of the 16th International Workshop on Spoken Dialogue System Technology",
month = feb,
year = "2026",
address = "Trento, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.iwsds-1.9/",
pages = "95--103",
abstract = "In spoken dialogue systems, even when the utterance text is the same, speaking style or tone differences can change its nuance. To respond appropriately in such cases, systems must accurately interpret paralinguistic information. Our study evaluates such a system{'}s ability using the ``paraling-dial'' dataset, which pairs a fixed utterance text with five distinct emotional expressions and their corresponding responses. We define a task using this dataset that detects mismatches{---}referred to as emotional dialogue breakdowns{---}between the expressed emotion of an utterance and the content of its response. We propose a breakdown detection system based on the Feature-wise Linear Modulation ({F}i{LM}) model, under the hypothesis that emotion dynamically controls text interpretation. Our experimental results show that the proposed model achieves 93.8{\%} accuracy with gold emotion labels and 91.2{\%} with predicted labels, demonstrating both its effectiveness and practicality. We also compare different types of control signals to identify the level of information required for such a breakdown detection task: emotion labels, emotion embeddings, and acoustic features. The results suggest that the appropriate level of abstraction, rather than simply richer information, is crucial for designing effective control signals."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nakaguro-yoshino-2026-exploring">
<titleInfo>
<title>Exploring Emotional Nuances in Spoken Dialogue: Dataset Construction and Prediction of Emotional Dialogue Breakdown</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hyuga</namePart>
<namePart type="family">Nakaguro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichiro</namePart>
<namePart type="family">Yoshino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-02</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Workshop on Spoken Dialogue System Technology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Riccardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seyed</namePart>
<namePart type="given">Mahed</namePart>
<namePart type="family">Mousavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Ines</namePart>
<namePart type="family">Torres</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichiro</namePart>
<namePart type="family">Yoshino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zoraida</namePart>
<namePart type="family">Callejas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="given">Absar</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Bechet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Gustafson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Géraldine</namePart>
<namePart type="family">Damnati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Papangelis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">Fernando</namePart>
<namePart type="family">D’Haro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Mendonça</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raffaella</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="given">“Pino”</namePart>
<namePart type="family">Di Fabbrizio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gokhan</namePart>
<namePart type="family">Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Johnston</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Trento, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In spoken dialogue systems, even when the utterance text is the same, speaking style or tone differences can change its nuance. To respond appropriately in such cases, systems must accurately interpret paralinguistic information. Our study evaluates such a system’s ability using the “paraling-dial” dataset, which pairs a fixed utterance text with five distinct emotional expressions and their corresponding responses. We define a task using this dataset that detects mismatches—referred to as emotional dialogue breakdowns—between the expressed emotion of an utterance and the content of its response. We propose a breakdown detection system based on the Feature-wise Linear Modulation (FiLM) model, under the hypothesis that emotion dynamically controls text interpretation. Our experimental results show that the proposed model achieves 93.8% accuracy with gold emotion labels and 91.2% with predicted labels, demonstrating both its effectiveness and practicality. We also compare different types of control signals to identify the level of information required for such a breakdown detection task: emotion labels, emotion embeddings, and acoustic features. The results suggest that the appropriate level of abstraction, rather than simply richer information, is crucial for designing effective control signals.</abstract>
<identifier type="citekey">nakaguro-yoshino-2026-exploring</identifier>
<location>
<url>https://aclanthology.org/2026.iwsds-1.9/</url>
</location>
<part>
<date>2026-02</date>
<extent unit="page">
<start>95</start>
<end>103</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring Emotional Nuances in Spoken Dialogue: Dataset Construction and Prediction of Emotional Dialogue Breakdown
%A Nakaguro, Hyuga
%A Yoshino, Koichiro
%Y Riccardi, Giuseppe
%Y Mousavi, Seyed Mahed
%Y Torres, Maria Ines
%Y Yoshino, Koichiro
%Y Callejas, Zoraida
%Y Chowdhury, Shammur Absar
%Y Chen, Yun-Nung
%Y Bechet, Frederic
%Y Gustafson, Joakim
%Y Damnati, Géraldine
%Y Papangelis, Alex
%Y D’Haro, Luis Fernando
%Y Mendonça, John
%Y Bernardi, Raffaella
%Y Hakkani-Tur, Dilek
%Y Di Fabbrizio, Giuseppe “Pino”
%Y Kawahara, Tatsuya
%Y Alam, Firoj
%Y Tur, Gokhan
%Y Johnston, Michael
%S Proceedings of the 16th International Workshop on Spoken Dialogue System Technology
%D 2026
%8 February
%I Association for Computational Linguistics
%C Trento, Italy
%F nakaguro-yoshino-2026-exploring
%X In spoken dialogue systems, even when the utterance text is the same, speaking style or tone differences can change its nuance. To respond appropriately in such cases, systems must accurately interpret paralinguistic information. Our study evaluates such a system’s ability using the “paraling-dial” dataset, which pairs a fixed utterance text with five distinct emotional expressions and their corresponding responses. We define a task using this dataset that detects mismatches—referred to as emotional dialogue breakdowns—between the expressed emotion of an utterance and the content of its response. We propose a breakdown detection system based on the Feature-wise Linear Modulation (FiLM) model, under the hypothesis that emotion dynamically controls text interpretation. Our experimental results show that the proposed model achieves 93.8% accuracy with gold emotion labels and 91.2% with predicted labels, demonstrating both its effectiveness and practicality. We also compare different types of control signals to identify the level of information required for such a breakdown detection task: emotion labels, emotion embeddings, and acoustic features. The results suggest that the appropriate level of abstraction, rather than simply richer information, is crucial for designing effective control signals.
%U https://aclanthology.org/2026.iwsds-1.9/
%P 95-103
Markdown (Informal)
[Exploring Emotional Nuances in Spoken Dialogue: Dataset Construction and Prediction of Emotional Dialogue Breakdown](https://aclanthology.org/2026.iwsds-1.9/) (Nakaguro & Yoshino, IWSDS 2026)
ACL