@inproceedings{williams-etal-2018-recognizing,
title = "Recognizing Emotions in Video Using Multimodal {DNN} Feature Fusion",
author = "Williams, Jennifer and
Kleinegesse, Steven and
Comanescu, Ramona and
Radu, Oana",
editor = "Zadeh, Amir and
Liang, Paul Pu and
Morency, Louis-Philippe and
Poria, Soujanya and
Cambria, Erik and
Scherer, Stefan",
booktitle = "Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-{HML})",
month = jul,
year = "2018",
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-3302",
doi = "10.18653/v1/W18-3302",
pages = "11--19",
abstract = "We present our system description of input-level multimodal fusion of audio, video, and text for recognition of emotions and their intensities for the 2018 First Grand Challenge on Computational Modeling of Human Multimodal Language. Our proposed approach is based on input-level feature fusion with sequence learning from Bidirectional Long-Short Term Memory (BLSTM) deep neural networks (DNNs). We show that our fusion approach outperforms unimodal predictors. Our system performs 6-way simultaneous classification and regression, allowing for overlapping emotion labels in a video segment. This leads to an overall binary accuracy of 90{\%}, overall 4-class accuracy of 89.2{\%} and an overall mean-absolute-error (MAE) of 0.12. Our work shows that an early fusion technique can effectively predict the presence of multi-label emotions as well as their coarse-grained intensities. The presented multimodal approach creates a simple and robust baseline on this new Grand Challenge dataset. Furthermore, we provide a detailed analysis of emotion intensity distributions as output from our DNN, as well as a related discussion concerning the inherent difficulty of this task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="williams-etal-2018-recognizing">
<titleInfo>
<title>Recognizing Emotions in Video Using Multimodal DNN Feature Fusion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jennifer</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Kleinegesse</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramona</namePart>
<namePart type="family">Comanescu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oana</namePart>
<namePart type="family">Radu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-HML)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="given">Pu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis-Philippe</namePart>
<namePart type="family">Morency</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soujanya</namePart>
<namePart type="family">Poria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Cambria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="family">Scherer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Melbourne, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>We present our system description of input-level multimodal fusion of audio, video, and text for recognition of emotions and their intensities for the 2018 First Grand Challenge on Computational Modeling of Human Multimodal Language. Our proposed approach is based on input-level feature fusion with sequence learning from Bidirectional Long Short-Term Memory (BLSTM) deep neural networks (DNNs). We show that our fusion approach outperforms unimodal predictors. Our system performs 6-way simultaneous classification and regression, allowing for overlapping emotion labels in a video segment. This leads to an overall binary accuracy of 90%, overall 4-class accuracy of 89.2%, and an overall mean-absolute-error (MAE) of 0.12. Our work shows that an early fusion technique can effectively predict the presence of multi-label emotions as well as their coarse-grained intensities. The presented multimodal approach creates a simple and robust baseline on this new Grand Challenge dataset. Furthermore, we provide a detailed analysis of emotion intensity distributions as output from our DNN, as well as a related discussion concerning the inherent difficulty of this task.</abstract>
<identifier type="citekey">williams-etal-2018-recognizing</identifier>
<identifier type="doi">10.18653/v1/W18-3302</identifier>
<location>
<url>https://aclanthology.org/W18-3302</url>
</location>
<part>
<date>2018-07</date>
<extent unit="page">
<start>11</start>
<end>19</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Recognizing Emotions in Video Using Multimodal DNN Feature Fusion
%A Williams, Jennifer
%A Kleinegesse, Steven
%A Comanescu, Ramona
%A Radu, Oana
%Y Zadeh, Amir
%Y Liang, Paul Pu
%Y Morency, Louis-Philippe
%Y Poria, Soujanya
%Y Cambria, Erik
%Y Scherer, Stefan
%S Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-HML)
%D 2018
%8 July
%I Association for Computational Linguistics
%C Melbourne, Australia
%F williams-etal-2018-recognizing
%X We present our system description of input-level multimodal fusion of audio, video, and text for recognition of emotions and their intensities for the 2018 First Grand Challenge on Computational Modeling of Human Multimodal Language. Our proposed approach is based on input-level feature fusion with sequence learning from Bidirectional Long Short-Term Memory (BLSTM) deep neural networks (DNNs). We show that our fusion approach outperforms unimodal predictors. Our system performs 6-way simultaneous classification and regression, allowing for overlapping emotion labels in a video segment. This leads to an overall binary accuracy of 90%, overall 4-class accuracy of 89.2%, and an overall mean-absolute-error (MAE) of 0.12. Our work shows that an early fusion technique can effectively predict the presence of multi-label emotions as well as their coarse-grained intensities. The presented multimodal approach creates a simple and robust baseline on this new Grand Challenge dataset. Furthermore, we provide a detailed analysis of emotion intensity distributions as output from our DNN, as well as a related discussion concerning the inherent difficulty of this task.
%R 10.18653/v1/W18-3302
%U https://aclanthology.org/W18-3302
%U https://doi.org/10.18653/v1/W18-3302
%P 11-19
Markdown (Informal)
[Recognizing Emotions in Video Using Multimodal DNN Feature Fusion](https://aclanthology.org/W18-3302) (Williams et al., ACL 2018)
ACL
Jennifer Williams, Steven Kleinegesse, Ramona Comanescu, and Oana Radu. 2018. Recognizing Emotions in Video Using Multimodal DNN Feature Fusion. In Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-HML), pages 11–19, Melbourne, Australia. Association for Computational Linguistics.
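
The abstract above describes the system only at a high level: per-time-step (input-level) concatenation of audio, video, and text features, a BLSTM sequence model, and joint 6-way multi-label classification plus intensity regression. As a reading aid, here is a minimal PyTorch sketch of that early-fusion setup; the feature dimensions, hidden size, last-step pooling, and loss combination are illustrative assumptions, not the configuration reported by Williams et al. (2018).

```python
# Minimal sketch of input-level (early) multimodal fusion with a BLSTM,
# jointly predicting 6 emotion presence labels (multi-label classification)
# and 6 emotion intensities (regression), as described in the abstract.
# Feature sizes and hyperparameters are illustrative assumptions, not the
# values used in the paper.
import torch
import torch.nn as nn

class EarlyFusionBLSTM(nn.Module):
    def __init__(self, audio_dim=74, video_dim=35, text_dim=300,
                 hidden_dim=128, num_emotions=6):
        super().__init__()
        # Early fusion: concatenate modality features at each time step.
        fused_dim = audio_dim + video_dim + text_dim
        self.blstm = nn.LSTM(fused_dim, hidden_dim, batch_first=True,
                             bidirectional=True)
        self.classifier = nn.Linear(2 * hidden_dim, num_emotions)  # presence logits
        self.regressor = nn.Linear(2 * hidden_dim, num_emotions)   # intensities

    def forward(self, audio, video, text):
        # All inputs: (batch, time, feature_dim), aligned per time step.
        fused = torch.cat([audio, video, text], dim=-1)
        outputs, _ = self.blstm(fused)
        last = outputs[:, -1, :]  # final BLSTM output summarizes the segment
        return self.classifier(last), self.regressor(last)

model = EarlyFusionBLSTM()
audio = torch.randn(8, 50, 74)
video = torch.randn(8, 50, 35)
text = torch.randn(8, 50, 300)
logits, intensities = model(audio, video, text)

# Multi-label presence via sigmoid + BCE; intensities via L1 (MAE) loss.
cls_loss = nn.BCEWithLogitsLoss()(logits, torch.randint(0, 2, (8, 6)).float())
reg_loss = nn.L1Loss()(intensities, torch.rand(8, 6) * 3)
total_loss = cls_loss + reg_loss
```

The sigmoid/BCE head permits overlapping emotion labels within a segment, matching the multi-label setup the abstract describes, and the L1 head mirrors the reported MAE metric; the authors' exact output formulation and loss weighting may differ.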