% Sundar and Heck (2022): workshop paper surveying multimodal conversational AI.
% Acronyms in title and booktitle are brace-protected so that styles which
% recase these fields keep them uppercase.
@inproceedings{sundar-heck-2022-multimodal,
  title     = {Multimodal Conversational {AI}: A Survey of Datasets and Approaches},
  author    = {Sundar, Anirudh and
               Heck, Larry},
  editor    = {Liu, Bing and
               Papangelis, Alexandros and
               Ultes, Stefan and
               Rastogi, Abhinav and
               Chen, Yun-Nung and
               Spithourakis, Georgios and
               Nouri, Elnaz and
               Shi, Weiyan},
  booktitle = {Proceedings of the 4th Workshop on {NLP} for Conversational {AI}},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.nlp4convai-1.12},
  doi       = {10.18653/v1/2022.nlp4convai-1.12},
  pages     = {131--147},
  abstract  = {As humans, we experience the world with all our senses or modalities (sound, sight, touch, smell, and taste). We use these modalities, particularly sight and touch, to convey and interpret specific meanings. Multimodal expressions are central to conversations; a rich set of modalities amplify and often compensate for each other. A multimodal conversational AI system answers questions, fulfills tasks, and emulates human conversations by understanding and expressing itself via multiple modalities. This paper motivates, defines, and mathematically formulates the multimodal conversational research objective. We provide a taxonomy of research required to solve the objective: multimodal representation, fusion, alignment, translation, and co-learning. We survey state-of-the-art datasets and approaches for each research area and highlight their limiting assumptions. Finally, we identify multimodal co-learning as a promising direction for multimodal conversational AI research.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for citekey sundar-heck-2022-multimodal. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sundar-heck-2022-multimodal">
    <!-- Article title -->
    <titleInfo>
      <title>Multimodal Conversational AI: A Survey of Datasets and Approaches</title>
    </titleInfo>
    <!-- Article authors -->
    <name type="personal">
      <namePart type="given">Anirudh</namePart>
      <namePart type="family">Sundar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Larry</namePart>
      <namePart type="family">Heck</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th Workshop on NLP for Conversational AI</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Bing</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alexandros</namePart>
        <namePart type="family">Papangelis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stefan</namePart>
        <namePart type="family">Ultes</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Abhinav</namePart>
        <namePart type="family">Rastogi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Georgios</namePart>
        <namePart type="family">Spithourakis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Elnaz</namePart>
        <namePart type="family">Nouri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Weiyan</namePart>
        <namePart type="family">Shi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>As humans, we experience the world with all our senses or modalities (sound, sight, touch, smell, and taste). We use these modalities, particularly sight and touch, to convey and interpret specific meanings. Multimodal expressions are central to conversations; a rich set of modalities amplify and often compensate for each other. A multimodal conversational AI system answers questions, fulfills tasks, and emulates human conversations by understanding and expressing itself via multiple modalities. This paper motivates, defines, and mathematically formulates the multimodal conversational research objective. We provide a taxonomy of research required to solve the objective: multimodal representation, fusion, alignment, translation, and co-learning. We survey state-of-the-art datasets and approaches for each research area and highlight their limiting assumptions. Finally, we identify multimodal co-learning as a promising direction for multimodal conversational AI research.</abstract>
    <!-- Identifiers and access location -->
    <identifier type="citekey">sundar-heck-2022-multimodal</identifier>
    <identifier type="doi">10.18653/v1/2022.nlp4convai-1.12</identifier>
    <location>
      <url>https://aclanthology.org/2022.nlp4convai-1.12</url>
    </location>
    <!-- Date and page extent of the article within the host volume -->
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>131</start>
        <end>147</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Conversational AI: A Survey of Datasets and Approaches
%A Sundar, Anirudh
%A Heck, Larry
%Y Liu, Bing
%Y Papangelis, Alexandros
%Y Ultes, Stefan
%Y Rastogi, Abhinav
%Y Chen, Yun-Nung
%Y Spithourakis, Georgios
%Y Nouri, Elnaz
%Y Shi, Weiyan
%S Proceedings of the 4th Workshop on NLP for Conversational AI
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F sundar-heck-2022-multimodal
%X As humans, we experience the world with all our senses or modalities (sound, sight, touch, smell, and taste). We use these modalities, particularly sight and touch, to convey and interpret specific meanings. Multimodal expressions are central to conversations; a rich set of modalities amplify and often compensate for each other. A multimodal conversational AI system answers questions, fulfills tasks, and emulates human conversations by understanding and expressing itself via multiple modalities. This paper motivates, defines, and mathematically formulates the multimodal conversational research objective. We provide a taxonomy of research required to solve the objective: multimodal representation, fusion, alignment, translation, and co-learning. We survey state-of-the-art datasets and approaches for each research area and highlight their limiting assumptions. Finally, we identify multimodal co-learning as a promising direction for multimodal conversational AI research.
%R 10.18653/v1/2022.nlp4convai-1.12
%U https://aclanthology.org/2022.nlp4convai-1.12
%U https://doi.org/10.18653/v1/2022.nlp4convai-1.12
%P 131-147
Markdown (Informal)
[Multimodal Conversational AI: A Survey of Datasets and Approaches](https://aclanthology.org/2022.nlp4convai-1.12) (Sundar & Heck, NLP4ConvAI 2022)
ACL