@inproceedings{kottur-etal-2021-simmc,
title = "{SIMMC} 2.0: A Task-oriented Dialog Dataset for Immersive Multimodal Conversations",
author = "Kottur, Satwik and
Moon, Seungwhan and
Geramifard, Alborz and
Damavandi, Babak",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.401",
doi = "10.18653/v1/2021.emnlp-main.401",
pages = "4903--4912",
abstract = "Next generation task-oriented dialog systems need to understand conversational contexts with their perceived surroundings, to effectively help users in the real-world multimodal environment. Existing task-oriented dialog datasets aimed towards virtual assistance fall short and do not situate the dialog in the user{'}s multimodal context. To overcome, we present a new dataset for Situated and Interactive Multimodal Conversations, SIMMC 2.0, which includes 11K task-oriented user{\textless}-{\textgreater}assistant dialogs (117K utterances) in the shopping domain, grounded in immersive and photo-realistic scenes. The dialogs are collection using a two-phase pipeline: (1) A novel multimodal dialog simulator generates simulated dialog flows, with an emphasis on diversity and richness of interactions, (2) Manual paraphrasing of generating utterances to draw from natural language distribution. We provide an in-depth analysis of the collected dataset, and describe in detail the four main benchmark tasks we propose for SIMMC 2.0. Our baseline model, powered by the state-of-the-art language model, shows promising results, and highlights new challenges and directions for the community to study.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kottur-etal-2021-simmc">
<titleInfo>
<title>SIMMC 2.0: A Task-oriented Dialog Dataset for Immersive Multimodal Conversations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Satwik</namePart>
<namePart type="family">Kottur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seungwhan</namePart>
<namePart type="family">Moon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alborz</namePart>
<namePart type="family">Geramifard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Babak</namePart>
<namePart type="family">Damavandi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Next generation task-oriented dialog systems need to understand conversational contexts with their perceived surroundings, to effectively help users in the real-world multimodal environment. Existing task-oriented dialog datasets aimed towards virtual assistance fall short and do not situate the dialog in the user’s multimodal context. To overcome, we present a new dataset for Situated and Interactive Multimodal Conversations, SIMMC 2.0, which includes 11K task-oriented user\textless-\textgreaterassistant dialogs (117K utterances) in the shopping domain, grounded in immersive and photo-realistic scenes. The dialogs are collection using a two-phase pipeline: (1) A novel multimodal dialog simulator generates simulated dialog flows, with an emphasis on diversity and richness of interactions, (2) Manual paraphrasing of generating utterances to draw from natural language distribution. We provide an in-depth analysis of the collected dataset, and describe in detail the four main benchmark tasks we propose for SIMMC 2.0. Our baseline model, powered by the state-of-the-art language model, shows promising results, and highlights new challenges and directions for the community to study.</abstract>
<identifier type="citekey">kottur-etal-2021-simmc</identifier>
<identifier type="doi">10.18653/v1/2021.emnlp-main.401</identifier>
<location>
<url>https://aclanthology.org/2021.emnlp-main.401</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>4903</start>
<end>4912</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SIMMC 2.0: A Task-oriented Dialog Dataset for Immersive Multimodal Conversations
%A Kottur, Satwik
%A Moon, Seungwhan
%A Geramifard, Alborz
%A Damavandi, Babak
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F kottur-etal-2021-simmc
%X Next generation task-oriented dialog systems need to understand conversational contexts with their perceived surroundings, to effectively help users in the real-world multimodal environment. Existing task-oriented dialog datasets aimed towards virtual assistance fall short and do not situate the dialog in the user’s multimodal context. To overcome, we present a new dataset for Situated and Interactive Multimodal Conversations, SIMMC 2.0, which includes 11K task-oriented user\textless-\textgreaterassistant dialogs (117K utterances) in the shopping domain, grounded in immersive and photo-realistic scenes. The dialogs are collection using a two-phase pipeline: (1) A novel multimodal dialog simulator generates simulated dialog flows, with an emphasis on diversity and richness of interactions, (2) Manual paraphrasing of generating utterances to draw from natural language distribution. We provide an in-depth analysis of the collected dataset, and describe in detail the four main benchmark tasks we propose for SIMMC 2.0. Our baseline model, powered by the state-of-the-art language model, shows promising results, and highlights new challenges and directions for the community to study.
%R 10.18653/v1/2021.emnlp-main.401
%U https://aclanthology.org/2021.emnlp-main.401
%U https://doi.org/10.18653/v1/2021.emnlp-main.401
%P 4903-4912
Markdown (Informal)
[SIMMC 2.0: A Task-oriented Dialog Dataset for Immersive Multimodal Conversations](https://aclanthology.org/2021.emnlp-main.401) (Kottur et al., EMNLP 2021)
ACL