% Conference paper in the SIGDIAL proceedings (26th Annual Meeting), pages 449 to 460.
% Field values are brace-delimited; the month uses the predefined three-letter macro.
@inproceedings{sato-etal-2025-key,
    title     = {Key Challenges in Multimodal Task-Oriented Dialogue Systems: Insights from a Large Competition-Based Dataset},
    author    = {Sato, Shiki and
                 Iwata, Shinji and
                 Hentona, Asahi and
                 Sasaki, Yuta and
                 Yamazaki, Takato and
                 Moriya, Shoji and
                 Ohagi, Masaya and
                 Kikuchi, Hirofumi and
                 Yang, Jie and
                 Qi, Zhiyang and
                 Kodama, Takashi and
                 Lee, Akinobu and
                 Komuro, Masato and
                 Nishikawa, Hiroyuki and
                 Makino, Ryosaku and
                 Minato, Takashi and
                 Sakai, Kurima and
                 Funayama, Tomo and
                 Funakoshi, Kotaro and
                 Usami, Mayumi and
                 Inaba, Michimasa and
                 Takahashi, Tetsuro and
                 Higashinaka, Ryuichiro},
    editor    = {B{\'e}chet, Fr{\'e}d{\'e}ric and
                 Lef{\`e}vre, Fabrice and
                 Asher, Nicholas and
                 Kim, Seokhwan and
                 Merlin, Teva},
    booktitle = {Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
    month     = aug,
    year      = {2025},
    address   = {Avignon, France},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.sigdial-1.36/},
    pages     = {449--460},
    abstract  = {Challenges in multimodal task-oriented dialogue between humans and systems, particularly those involving audio and visual interactions, have not been sufficiently explored or shared, forcing researchers to define improvement directions individually without a clearly shared roadmap. To address these challenges, we organized a competition for multimodal task-oriented dialogue systems and constructed a large competition-based dataset of 1,865 minutes of Japanese task-oriented dialogues. This dataset includes audio and visual interactions between diverse systems and human participants. After analyzing system behaviors identified as problematic by the human participants in questionnaire surveys and notable methods employed by the participating teams, we identified key challenges in multimodal task-oriented dialogue systems and discussed potential directions for overcoming these challenges.}
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for citekey sato-etal-2025-key: the paper's own metadata (title, 23 authors, issue date),
     followed by the host proceedings (title, 5 editors, publisher, place) and the page extent. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sato-etal-2025-key">
    <titleInfo>
      <title>Key Challenges in Multimodal Task-Oriented Dialogue Systems: Insights from a Large Competition-Based Dataset</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Shiki</namePart>
      <namePart type="family">Sato</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Shinji</namePart>
      <namePart type="family">Iwata</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Asahi</namePart>
      <namePart type="family">Hentona</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Yuta</namePart>
      <namePart type="family">Sasaki</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Takato</namePart>
      <namePart type="family">Yamazaki</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Shoji</namePart>
      <namePart type="family">Moriya</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Masaya</namePart>
      <namePart type="family">Ohagi</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Hirofumi</namePart>
      <namePart type="family">Kikuchi</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Jie</namePart>
      <namePart type="family">Yang</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Zhiyang</namePart>
      <namePart type="family">Qi</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Takashi</namePart>
      <namePart type="family">Kodama</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Akinobu</namePart>
      <namePart type="family">Lee</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Masato</namePart>
      <namePart type="family">Komuro</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Hiroyuki</namePart>
      <namePart type="family">Nishikawa</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Ryosaku</namePart>
      <namePart type="family">Makino</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Takashi</namePart>
      <namePart type="family">Minato</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Kurima</namePart>
      <namePart type="family">Sakai</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Tomo</namePart>
      <namePart type="family">Funayama</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Kotaro</namePart>
      <namePart type="family">Funakoshi</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Mayumi</namePart>
      <namePart type="family">Usami</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Michimasa</namePart>
      <namePart type="family">Inaba</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Tetsuro</namePart>
      <namePart type="family">Takahashi</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Ryuichiro</namePart>
      <namePart type="family">Higashinaka</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <originInfo>
      <dateIssued>2025-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Frédéric</namePart>
        <namePart type="family">Béchet</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Fabrice</namePart>
        <namePart type="family">Lefèvre</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Nicholas</namePart>
        <namePart type="family">Asher</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Seokhwan</namePart>
        <namePart type="family">Kim</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Teva</namePart>
        <namePart type="family">Merlin</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place><placeTerm type="text">Avignon, France</placeTerm></place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Challenges in multimodal task-oriented dialogue between humans and systems, particularly those involving audio and visual interactions, have not been sufficiently explored or shared, forcing researchers to define improvement directions individually without a clearly shared roadmap. To address these challenges, we organized a competition for multimodal task-oriented dialogue systems and constructed a large competition-based dataset of 1,865 minutes of Japanese task-oriented dialogues. This dataset includes audio and visual interactions between diverse systems and human participants. After analyzing system behaviors identified as problematic by the human participants in questionnaire surveys and notable methods employed by the participating teams, we identified key challenges in multimodal task-oriented dialogue systems and discussed potential directions for overcoming these challenges.</abstract>
    <identifier type="citekey">sato-etal-2025-key</identifier>
    <location><url>https://aclanthology.org/2025.sigdial-1.36/</url></location>
    <part>
      <date>2025-08</date>
      <extent unit="page">
        <start>449</start>
        <end>460</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Key Challenges in Multimodal Task-Oriented Dialogue Systems: Insights from a Large Competition-Based Dataset
%A Sato, Shiki
%A Iwata, Shinji
%A Hentona, Asahi
%A Sasaki, Yuta
%A Yamazaki, Takato
%A Moriya, Shoji
%A Ohagi, Masaya
%A Kikuchi, Hirofumi
%A Yang, Jie
%A Qi, Zhiyang
%A Kodama, Takashi
%A Lee, Akinobu
%A Komuro, Masato
%A Nishikawa, Hiroyuki
%A Makino, Ryosaku
%A Minato, Takashi
%A Sakai, Kurima
%A Funayama, Tomo
%A Funakoshi, Kotaro
%A Usami, Mayumi
%A Inaba, Michimasa
%A Takahashi, Tetsuro
%A Higashinaka, Ryuichiro
%Y Béchet, Frédéric
%Y Lefèvre, Fabrice
%Y Asher, Nicholas
%Y Kim, Seokhwan
%Y Merlin, Teva
%S Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F sato-etal-2025-key
%X Challenges in multimodal task-oriented dialogue between humans and systems, particularly those involving audio and visual interactions, have not been sufficiently explored or shared, forcing researchers to define improvement directions individually without a clearly shared roadmap. To address these challenges, we organized a competition for multimodal task-oriented dialogue systems and constructed a large competition-based dataset of 1,865 minutes of Japanese task-oriented dialogues. This dataset includes audio and visual interactions between diverse systems and human participants. After analyzing system behaviors identified as problematic by the human participants in questionnaire surveys and notable methods employed by the participating teams, we identified key challenges in multimodal task-oriented dialogue systems and discussed potential directions for overcoming these challenges.
%U https://aclanthology.org/2025.sigdial-1.36/
%P 449-460
Markdown (Informal)
[Key Challenges in Multimodal Task-Oriented Dialogue Systems: Insights from a Large Competition-Based Dataset](https://aclanthology.org/2025.sigdial-1.36/) (Sato et al., SIGDIAL 2025)
ACL
- Shiki Sato, Shinji Iwata, Asahi Hentona, Yuta Sasaki, Takato Yamazaki, Shoji Moriya, Masaya Ohagi, Hirofumi Kikuchi, Jie Yang, Zhiyang Qi, Takashi Kodama, Akinobu Lee, Masato Komuro, Hiroyuki Nishikawa, Ryosaku Makino, Takashi Minato, Kurima Sakai, Tomo Funayama, Kotaro Funakoshi, Mayumi Usami, Michimasa Inaba, Tetsuro Takahashi, and Ryuichiro Higashinaka. 2025. Key Challenges in Multimodal Task-Oriented Dialogue Systems: Insights from a Large Competition-Based Dataset. In Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue, pages 449–460, Avignon, France. Association for Computational Linguistics.