@inproceedings{jiang-2025-towards,
title = "Towards Human-Like Dialogue Systems: Integrating Multimodal Emotion Recognition and Non-Verbal Cue Generation",
author = "Jiang, Jingjing",
editor = "Whetten, Ryan and
Sucal, Virgile and
Ngo, Anh and
Chalamalasetti, Kranti and
Inoue, Koji and
Cimino, Gaetano and
Yang, Zachary and
Zenimoto, Yuki and
Rodriguez, Ricardo",
booktitle = "Proceedings of the 21st Workshop of Young Researchers' Roundtable on Spoken Dialogue Systems",
month = aug,
year = "2025",
address = "Avignon, France",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.yrrsds-1.6/",
pages = "15--17",
abstract = "This position paper outlines my research vision for developing human-like dialogue systems capable of both perceiving and expressing emotions through multimodal communication. My current research focuses on two main areas: multimodal emotion recognition and non-verbal cue generation. For emotion recognition, I constructed a Japanese multimodal dialogue dataset that captures natural, dyadic face-to-face interactions and developed an emotional valence recognition model that integrates textual, speech and physiological inputs. On the generation side, my research explores non-verbal cue generation for embodied conversational agents (ECAs). Finally, the paper discusses the future of SDSs, emphasizing the shift from traditional turn-based architectures to full-duplex, real-time, multimodal systems."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jiang-2025-towards">
<titleInfo>
<title>Towards Human-Like Dialogue Systems: Integrating Multimodal Emotion Recognition and Non-Verbal Cue Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jingjing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Whetten</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Virgile</namePart>
<namePart type="family">Sucal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anh</namePart>
<namePart type="family">Ngo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kranti</namePart>
<namePart type="family">Chalamalasetti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koji</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gaetano</namePart>
<namePart type="family">Cimino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zachary</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuki</namePart>
<namePart type="family">Zenimoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ricardo</namePart>
<namePart type="family">Rodriguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Avignon, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>This position paper outlines my research vision for developing human-like dialogue systems capable of both perceiving and expressing emotions through multimodal communication. My current research focuses on two main areas: multimodal emotion recognition and non-verbal cue generation. For emotion recognition, I constructed a Japanese multimodal dialogue dataset that captures natural, dyadic face-to-face interactions and developed an emotional valence recognition model that integrates textual, speech, and physiological inputs. On the generation side, my research explores non-verbal cue generation for embodied conversational agents (ECAs). Finally, the paper discusses the future of spoken dialogue systems (SDSs), emphasizing the shift from traditional turn-based architectures to full-duplex, real-time, multimodal systems.</abstract>
<identifier type="citekey">jiang-2025-towards</identifier>
<location>
<url>https://aclanthology.org/2025.yrrsds-1.6/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>15</start>
<end>17</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Human-Like Dialogue Systems: Integrating Multimodal Emotion Recognition and Non-Verbal Cue Generation
%A Jiang, Jingjing
%Y Whetten, Ryan
%Y Sucal, Virgile
%Y Ngo, Anh
%Y Chalamalasetti, Kranti
%Y Inoue, Koji
%Y Cimino, Gaetano
%Y Yang, Zachary
%Y Zenimoto, Yuki
%Y Rodriguez, Ricardo
%S Proceedings of the 21st Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F jiang-2025-towards
%X This position paper outlines my research vision for developing human-like dialogue systems capable of both perceiving and expressing emotions through multimodal communication. My current research focuses on two main areas: multimodal emotion recognition and non-verbal cue generation. For emotion recognition, I constructed a Japanese multimodal dialogue dataset that captures natural, dyadic face-to-face interactions and developed an emotional valence recognition model that integrates textual, speech, and physiological inputs. On the generation side, my research explores non-verbal cue generation for embodied conversational agents (ECAs). Finally, the paper discusses the future of spoken dialogue systems (SDSs), emphasizing the shift from traditional turn-based architectures to full-duplex, real-time, multimodal systems.
%U https://aclanthology.org/2025.yrrsds-1.6/
%P 15-17
Markdown (Informal)
[Towards Human-Like Dialogue Systems: Integrating Multimodal Emotion Recognition and Non-Verbal Cue Generation](https://aclanthology.org/2025.yrrsds-1.6/) (Jiang, YRRSDS 2025)
ACL
Jingjing Jiang. 2025. Towards Human-Like Dialogue Systems: Integrating Multimodal Emotion Recognition and Non-Verbal Cue Generation. In Proceedings of the 21st Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems, pages 15–17, Avignon, France. Association for Computational Linguistics.