@inproceedings{zhao-etal-2022-m3ed,
title = "{M}3{ED}: Multi-modal Multi-scene Multi-label Emotional Dialogue Database",
author = "Zhao, Jinming and
Zhang, Tenggan and
Hu, Jingwen and
Liu, Yuchen and
Jin, Qin and
Wang, Xinchao and
Li, Haizhou",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-long.391",
doi = "10.18653/v1/2022.acl-long.391",
pages = "5699--5710",
abstract = "The emotional state of a speaker can be influenced by many different factors in dialogues, such as dialogue scene, dialogue topic, and interlocutor stimulus. The currently available data resources to support such multimodal affective analysis in dialogues are however limited in scale and diversity. In this work, we propose a Multi-modal Multi-scene Multi-label Emotional Dialogue dataset, M$^3$ED, which contains 990 dyadic emotional dialogues from 56 different TV series, a total of 9,082 turns and 24,449 utterances. M$^3$ED is annotated with 7 emotion categories (happy, surprise, sad, disgust, anger, fear, and neutral) at utterance level, and encompasses acoustic, visual, and textual modalities. To the best of our knowledge, M$^3$ED is the first multimodal emotional dialogue dataset in Chinese.It is valuable for cross-culture emotion analysis and recognition. We apply several state-of-the-art methods on the M$^3$ED dataset to verify the validity and quality of the dataset. We also propose a general Multimodal Dialogue-aware Interaction framework, MDI, to model the dialogue context for emotion recognition, which achieves comparable performance to the state-of-the-art methods on the M$^3$ED. The full dataset and codes are available.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2022-m3ed">
<titleInfo>
<title>M3ED: Multi-modal Multi-scene Multi-label Emotional Dialogue Database</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jinming</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tenggan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingwen</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuchen</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qin</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinchao</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haizhou</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The emotional state of a speaker can be influenced by many different factors in dialogues, such as dialogue scene, dialogue topic, and interlocutor stimulus. The currently available data resources to support such multimodal affective analysis in dialogues are however limited in scale and diversity. In this work, we propose a Multi-modal Multi-scene Multi-label Emotional Dialogue dataset, M³ED, which contains 990 dyadic emotional dialogues from 56 different TV series, a total of 9,082 turns and 24,449 utterances. M³ED is annotated with 7 emotion categories (happy, surprise, sad, disgust, anger, fear, and neutral) at utterance level, and encompasses acoustic, visual, and textual modalities. To the best of our knowledge, M³ED is the first multimodal emotional dialogue dataset in Chinese.It is valuable for cross-culture emotion analysis and recognition. We apply several state-of-the-art methods on the M³ED dataset to verify the validity and quality of the dataset. We also propose a general Multimodal Dialogue-aware Interaction framework, MDI, to model the dialogue context for emotion recognition, which achieves comparable performance to the state-of-the-art methods on the M³ED. The full dataset and codes are available.</abstract>
<identifier type="citekey">zhao-etal-2022-m3ed</identifier>
<identifier type="doi">10.18653/v1/2022.acl-long.391</identifier>
<location>
<url>https://aclanthology.org/2022.acl-long.391</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>5699</start>
<end>5710</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T M3ED: Multi-modal Multi-scene Multi-label Emotional Dialogue Database
%A Zhao, Jinming
%A Zhang, Tenggan
%A Hu, Jingwen
%A Liu, Yuchen
%A Jin, Qin
%A Wang, Xinchao
%A Li, Haizhou
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F zhao-etal-2022-m3ed
%X The emotional state of a speaker can be influenced by many different factors in dialogues, such as dialogue scene, dialogue topic, and interlocutor stimulus. The currently available data resources to support such multimodal affective analysis in dialogues are, however, limited in scale and diversity. In this work, we propose a Multi-modal Multi-scene Multi-label Emotional Dialogue dataset, M³ED, which contains 990 dyadic emotional dialogues from 56 different TV series, with a total of 9,082 turns and 24,449 utterances. M³ED is annotated with 7 emotion categories (happy, surprise, sad, disgust, anger, fear, and neutral) at the utterance level, and encompasses acoustic, visual, and textual modalities. To the best of our knowledge, M³ED is the first multimodal emotional dialogue dataset in Chinese. It is valuable for cross-cultural emotion analysis and recognition. We apply several state-of-the-art methods to the M³ED dataset to verify its validity and quality. We also propose a general Multimodal Dialogue-aware Interaction framework, MDI, to model the dialogue context for emotion recognition, which achieves performance comparable to the state-of-the-art methods on M³ED. The full dataset and code are available.
%R 10.18653/v1/2022.acl-long.391
%U https://aclanthology.org/2022.acl-long.391
%U https://doi.org/10.18653/v1/2022.acl-long.391
%P 5699-5710
Markdown (Informal)
[M3ED: Multi-modal Multi-scene Multi-label Emotional Dialogue Database](https://aclanthology.org/2022.acl-long.391) (Zhao et al., ACL 2022)
ACL
Jinming Zhao, Tenggan Zhang, Jingwen Hu, Yuchen Liu, Qin Jin, Xinchao Wang, and Haizhou Li. 2022. M3ED: Multi-modal Multi-scene Multi-label Emotional Dialogue Database. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 5699–5710, Dublin, Ireland. Association for Computational Linguistics.