@inproceedings{blanchard-etal-2018-getting,
title = "Getting the subtext without the text: Scalable multimodal sentiment classification from visual and acoustic modalities",
author = "Blanchard, Nathaniel and
Moreira, Daniel and
Bharati, Aparna and
Scheirer, Walter",
editor = "Zadeh, Amir and
Liang, Paul Pu and
Morency, Louis-Philippe and
Poria, Soujanya and
Cambria, Erik and
Scherer, Stefan",
booktitle = "Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-{HML})",
month = jul,
year = "2018",
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-3301",
doi = "10.18653/v1/W18-3301",
pages = "1--10",
abstract = "In the last decade, video blogs (vlogs) have become an extremely popular method through which people express sentiment. The ubiquitousness of these videos has increased the importance of multimodal fusion models, which incorporate video and audio features with traditional text features for automatic sentiment detection. Multimodal fusion offers a unique opportunity to build models that learn from the full depth of expression available to human viewers. In the detection of sentiment in these videos, acoustic and video features provide clarity to otherwise ambiguous transcripts. In this paper, we present a multimodal fusion model that exclusively uses high-level video and audio features to analyze spoken sentences for sentiment. We discard traditional transcription features in order to minimize human intervention and to maximize the deployability of our model on at-scale real-world data. We select high-level features for our model that have been successful in non-affect domains in order to test their generalizability in the sentiment detection domain. We train and test our model on the newly released CMU Multimodal Opinion Sentiment and Emotion Intensity (CMU-MOSEI) dataset, obtaining an F1 score of 0.8049 on the validation set and an F1 score of 0.6325 on the held-out challenge test set.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="blanchard-etal-2018-getting">
<titleInfo>
<title>Getting the subtext without the text: Scalable multimodal sentiment classification from visual and acoustic modalities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nathaniel</namePart>
<namePart type="family">Blanchard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aparna</namePart>
<namePart type="family">Bharati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walter</namePart>
<namePart type="family">Scheirer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-HML)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="given">Pu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis-Philippe</namePart>
<namePart type="family">Morency</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soujanya</namePart>
<namePart type="family">Poria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Cambria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="family">Scherer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Melbourne, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In the last decade, video blogs (vlogs) have become an extremely popular method through which people express sentiment. The ubiquitousness of these videos has increased the importance of multimodal fusion models, which incorporate video and audio features with traditional text features for automatic sentiment detection. Multimodal fusion offers a unique opportunity to build models that learn from the full depth of expression available to human viewers. In the detection of sentiment in these videos, acoustic and video features provide clarity to otherwise ambiguous transcripts. In this paper, we present a multimodal fusion model that exclusively uses high-level video and audio features to analyze spoken sentences for sentiment. We discard traditional transcription features in order to minimize human intervention and to maximize the deployability of our model on at-scale real-world data. We select high-level features for our model that have been successful in non-affect domains in order to test their generalizability in the sentiment detection domain. We train and test our model on the newly released CMU Multimodal Opinion Sentiment and Emotion Intensity (CMU-MOSEI) dataset, obtaining an F1 score of 0.8049 on the validation set and an F1 score of 0.6325 on the held-out challenge test set.</abstract>
<identifier type="citekey">blanchard-etal-2018-getting</identifier>
<identifier type="doi">10.18653/v1/W18-3301</identifier>
<location>
<url>https://aclanthology.org/W18-3301</url>
</location>
<part>
<date>2018-07</date>
<extent unit="page">
<start>1</start>
<end>10</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Getting the subtext without the text: Scalable multimodal sentiment classification from visual and acoustic modalities
%A Blanchard, Nathaniel
%A Moreira, Daniel
%A Bharati, Aparna
%A Scheirer, Walter
%Y Zadeh, Amir
%Y Liang, Paul Pu
%Y Morency, Louis-Philippe
%Y Poria, Soujanya
%Y Cambria, Erik
%Y Scherer, Stefan
%S Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-HML)
%D 2018
%8 July
%I Association for Computational Linguistics
%C Melbourne, Australia
%F blanchard-etal-2018-getting
%X In the last decade, video blogs (vlogs) have become an extremely popular method through which people express sentiment. The ubiquitousness of these videos has increased the importance of multimodal fusion models, which incorporate video and audio features with traditional text features for automatic sentiment detection. Multimodal fusion offers a unique opportunity to build models that learn from the full depth of expression available to human viewers. In the detection of sentiment in these videos, acoustic and video features provide clarity to otherwise ambiguous transcripts. In this paper, we present a multimodal fusion model that exclusively uses high-level video and audio features to analyze spoken sentences for sentiment. We discard traditional transcription features in order to minimize human intervention and to maximize the deployability of our model on at-scale real-world data. We select high-level features for our model that have been successful in non-affect domains in order to test their generalizability in the sentiment detection domain. We train and test our model on the newly released CMU Multimodal Opinion Sentiment and Emotion Intensity (CMU-MOSEI) dataset, obtaining an F1 score of 0.8049 on the validation set and an F1 score of 0.6325 on the held-out challenge test set.
%R 10.18653/v1/W18-3301
%U https://aclanthology.org/W18-3301
%U https://doi.org/10.18653/v1/W18-3301
%P 1-10
Markdown (Informal)
[Getting the subtext without the text: Scalable multimodal sentiment classification from visual and acoustic modalities](https://aclanthology.org/W18-3301) (Blanchard et al., ACL 2018)
ACL