@inproceedings{jiang-etal-2025-integrating,
title = "Integrating Physiological, Speech, and Textual Information Toward Real-Time Recognition of Emotional Valence in Dialogue",
author = "Jiang, Jingjing and
Guo, Ao and
Higashinaka, Ryuichiro",
editor = "B{\'e}chet, Fr{\'e}d{\'e}ric and
Lef{\`e}vre, Fabrice and
Asher, Nicholas and
Kim, Seokhwan and
Merlin, Teva",
booktitle = "Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue",
month = aug,
year = "2025",
address = "Avignon, France",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sigdial-1.47/",
pages = "591--600",
abstract = "Accurately estimating users' emotional states in real time is crucial for enabling dialogue systems to respond adaptively. While existing approaches primarily rely on verbal information, such as text and speech, these modalities are often unavailable in non-speaking situations. In such cases, non-verbal information, particularly physiological signals, becomes essential for understanding users' emotional states. In this study, we aimed to develop a model for real-time recognition of users' binary emotional valence (high-valence vs. low-valence) during conversations. Specifically, we utilized an existing Japanese multimodal dialogue dataset, which includes various physiological signals, namely electrodermal activity (EDA), blood volume pulse (BVP), photoplethysmography (PPG), and pupil diameter, along with speech and textual data. We classify the emotional valence of every 15-second segment of dialogue interaction by integrating such multimodal inputs. To this end, time-series embeddings of physiological signals are extracted using a self-supervised encoder, while speech and textual features are obtained from pre-trained Japanese HuBERT and BERT models, respectively. The modality-specific embeddings are integrated using a feature fusion mechanism for emotional valence recognition. Experimental results show that while each modality individually contributes to emotion recognition, the inclusion of physiological signals leads to a notable performance improvement, particularly in non-speaking or minimally verbal situations. These findings underscore the importance of physiological information for enhancing real-time valence recognition in dialogue systems, especially when verbal information is limited."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jiang-etal-2025-integrating">
<titleInfo>
<title>Integrating Physiological, Speech, and Textual Information Toward Real-Time Recognition of Emotional Valence in Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jingjing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ao</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryuichiro</namePart>
<namePart type="family">Higashinaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabrice</namePart>
<namePart type="family">Lefèvre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Asher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Teva</namePart>
<namePart type="family">Merlin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Avignon, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Accurately estimating users’ emotional states in real time is crucial for enabling dialogue systems to respond adaptively. While existing approaches primarily rely on verbal information, such as text and speech, these modalities are often unavailable in non-speaking situations. In such cases, non-verbal information, particularly physiological signals, becomes essential for understanding users’ emotional states. In this study, we aimed to develop a model for real-time recognition of users’ binary emotional valence (high-valence vs. low-valence) during conversations. Specifically, we utilized an existing Japanese multimodal dialogue dataset, which includes various physiological signals, namely electrodermal activity (EDA), blood volume pulse (BVP), photoplethysmography (PPG), and pupil diameter, along with speech and textual data. We classify the emotional valence of every 15-second segment of dialogue interaction by integrating such multimodal inputs. To this end, time-series embeddings of physiological signals are extracted using a self-supervised encoder, while speech and textual features are obtained from pre-trained Japanese HuBERT and BERT models, respectively. The modality-specific embeddings are integrated using a feature fusion mechanism for emotional valence recognition. Experimental results show that while each modality individually contributes to emotion recognition, the inclusion of physiological signals leads to a notable performance improvement, particularly in non-speaking or minimally verbal situations. These findings underscore the importance of physiological information for enhancing real-time valence recognition in dialogue systems, especially when verbal information is limited.</abstract>
<identifier type="citekey">jiang-etal-2025-integrating</identifier>
<location>
<url>https://aclanthology.org/2025.sigdial-1.47/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>591</start>
<end>600</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Integrating Physiological, Speech, and Textual Information Toward Real-Time Recognition of Emotional Valence in Dialogue
%A Jiang, Jingjing
%A Guo, Ao
%A Higashinaka, Ryuichiro
%Y Béchet, Frédéric
%Y Lefèvre, Fabrice
%Y Asher, Nicholas
%Y Kim, Seokhwan
%Y Merlin, Teva
%S Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F jiang-etal-2025-integrating
%X Accurately estimating users’ emotional states in real time is crucial for enabling dialogue systems to respond adaptively. While existing approaches primarily rely on verbal information, such as text and speech, these modalities are often unavailable in non-speaking situations. In such cases, non-verbal information, particularly physiological signals, becomes essential for understanding users’ emotional states. In this study, we aimed to develop a model for real-time recognition of users’ binary emotional valence (high-valence vs. low-valence) during conversations. Specifically, we utilized an existing Japanese multimodal dialogue dataset, which includes various physiological signals, namely electrodermal activity (EDA), blood volume pulse (BVP), photoplethysmography (PPG), and pupil diameter, along with speech and textual data. We classify the emotional valence of every 15-second segment of dialogue interaction by integrating such multimodal inputs. To this end, time-series embeddings of physiological signals are extracted using a self-supervised encoder, while speech and textual features are obtained from pre-trained Japanese HuBERT and BERT models, respectively. The modality-specific embeddings are integrated using a feature fusion mechanism for emotional valence recognition. Experimental results show that while each modality individually contributes to emotion recognition, the inclusion of physiological signals leads to a notable performance improvement, particularly in non-speaking or minimally verbal situations. These findings underscore the importance of physiological information for enhancing real-time valence recognition in dialogue systems, especially when verbal information is limited.
%U https://aclanthology.org/2025.sigdial-1.47/
%P 591-600
Markdown (Informal)
[Integrating Physiological, Speech, and Textual Information Toward Real-Time Recognition of Emotional Valence in Dialogue](https://aclanthology.org/2025.sigdial-1.47/) (Jiang et al., SIGDIAL 2025)
ACL
Jingjing Jiang, Ao Guo, and Ryuichiro Higashinaka. 2025. Integrating Physiological, Speech, and Textual Information Toward Real-Time Recognition of Emotional Valence in Dialogue. In Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue, pages 591–600, Avignon, France. Association for Computational Linguistics.
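For readers who want a concrete picture of the pipeline the abstract describes (modality-specific embeddings from a self-supervised physiological encoder, Japanese HuBERT, and Japanese BERT, fused for binary valence classification of 15-second segments), here is a minimal PyTorch sketch. It assumes a simple concatenation-based late fusion, hypothetical embedding dimensions, and a hypothetical class name; the paper's actual encoder, fusion mechanism, and hyperparameters may differ.

```python
# Hypothetical sketch only: a concatenation-based late-fusion classifier for
# binary emotional valence (high vs. low) over one 15-second dialogue segment.
# Embedding dimensions and the fusion design are assumptions, not the paper's.
import torch
import torch.nn as nn


class ValenceFusionClassifier(nn.Module):
    def __init__(self, physio_dim=128, speech_dim=768, text_dim=768, hidden_dim=256):
        super().__init__()
        # Project each modality-specific embedding into a shared space.
        self.physio_proj = nn.Linear(physio_dim, hidden_dim)
        self.speech_proj = nn.Linear(speech_dim, hidden_dim)
        self.text_proj = nn.Linear(text_dim, hidden_dim)
        # Concatenate the projected embeddings and classify into two classes.
        self.classifier = nn.Sequential(
            nn.ReLU(),
            nn.Linear(3 * hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2),
        )

    def forward(self, physio_emb, speech_emb, text_emb):
        fused = torch.cat(
            [
                self.physio_proj(physio_emb),
                self.speech_proj(speech_emb),
                self.text_proj(text_emb),
            ],
            dim=-1,
        )
        return self.classifier(fused)


# Example usage with random stand-ins for precomputed segment embeddings
# (physiological encoder output, HuBERT speech embedding, BERT text embedding).
model = ValenceFusionClassifier()
logits = model(torch.randn(1, 128), torch.randn(1, 768), torch.randn(1, 768))
valence = logits.argmax(dim=-1)  # 0 = low-valence, 1 = high-valence (arbitrary convention here)
```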