@inproceedings{qiu-etal-2023-topic,
    title = "Topic and Style-aware Transformer for Multimodal Emotion Recognition",
    author = "Qiu, Shuwen and
      Sekhar, Nitesh and
      Singhal, Prateek",
    editor = "Rogers, Anna and
      Boyd-Graber, Jordan and
      Okazaki, Naoaki",
    booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-acl.130",
    doi = "10.18653/v1/2023.findings-acl.130",
    pages = "2074--2082",
    abstract = "Understanding emotion expressions in multimodal signals is key for machines to have a better understanding of human communication. While language, visual and acoustic modalities can provide clues from different perspectives, the visual modality is shown to make minimal contribution to the performance in the emotion recognition field due to its high dimensionality. Therefore, we first leverage the strong multimodality backbone VATT to project the visual signal to the common space with language and acoustic signals. Also, we propose content-oriented features Topic and Speaking style on top of it to approach the subjectivity issues. Experiments conducted on the benchmark dataset MOSEI show our model can outperform SOTA results and effectively incorporate visual signals and handle subjectivity issues by serving as content {``}normalization{''}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="qiu-etal-2023-topic">
    <titleInfo>
      <title>Topic and Style-aware Transformer for Multimodal Emotion Recognition</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shuwen</namePart>
      <namePart type="family">Qiu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nitesh</namePart>
      <namePart type="family">Sekhar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Prateek</namePart>
      <namePart type="family">Singhal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Rogers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jordan</namePart>
        <namePart type="family">Boyd-Graber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Naoaki</namePart>
        <namePart type="family">Okazaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Toronto, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Understanding emotion expressions in multimodal signals is key for machines to have a better understanding of human communication. While language, visual and acoustic modalities can provide clues from different perspectives, the visual modality is shown to make minimal contribution to the performance in the emotion recognition field due to its high dimensionality. Therefore, we first leverage the strong multimodality backbone VATT to project the visual signal to the common space with language and acoustic signals. Also, we propose content-oriented features Topic and Speaking style on top of it to approach the subjectivity issues. Experiments conducted on the benchmark dataset MOSEI show our model can outperform SOTA results and effectively incorporate visual signals and handle subjectivity issues by serving as content “normalization”.</abstract>
    <identifier type="citekey">qiu-etal-2023-topic</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-acl.130</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-acl.130</url>
    </location>
    <part>
      <date>2023-07</date>
      <extent unit="page">
        <start>2074</start>
        <end>2082</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Topic and Style-aware Transformer for Multimodal Emotion Recognition
%A Qiu, Shuwen
%A Sekhar, Nitesh
%A Singhal, Prateek
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F qiu-etal-2023-topic
%X Understanding emotion expressions in multimodal signals is key for machines to have a better understanding of human communication. While language, visual and acoustic modalities can provide clues from different perspectives, the visual modality is shown to make minimal contribution to the performance in the emotion recognition field due to its high dimensionality. Therefore, we first leverage the strong multimodality backbone VATT to project the visual signal to the common space with language and acoustic signals. Also, we propose content-oriented features Topic and Speaking style on top of it to approach the subjectivity issues. Experiments conducted on the benchmark dataset MOSEI show our model can outperform SOTA results and effectively incorporate visual signals and handle subjectivity issues by serving as content “normalization”.
%R 10.18653/v1/2023.findings-acl.130
%U https://aclanthology.org/2023.findings-acl.130
%U https://doi.org/10.18653/v1/2023.findings-acl.130
%P 2074-2082
Markdown (Informal)
[Topic and Style-aware Transformer for Multimodal Emotion Recognition](https://aclanthology.org/2023.findings-acl.130) (Qiu et al., Findings 2023)
ACL
Shuwen Qiu, Nitesh Sekhar, and Prateek Singhal. 2023. [Topic and Style-aware Transformer for Multimodal Emotion Recognition](https://aclanthology.org/2023.findings-acl.130). In *Findings of the Association for Computational Linguistics: ACL 2023*, pages 2074–2082, Toronto, Canada. Association for Computational Linguistics.