@inproceedings{sun-etal-2021-new,
title = "A New View of Multi-modal Language Analysis: Audio and Video Features as Text {``}Styles{''}",
author = "Sun, Zhongkai and
Sarma, Prathusha K and
Liang, Yingyu and
Sethares, William",
editor = "Merlo, Paola and
Tiedemann, Jorg and
Tsarfaty, Reut",
booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
month = apr,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eacl-main.167",
doi = "10.18653/v1/2021.eacl-main.167",
pages = "1956--1965",
abstract = "Imposing the style of one image onto another is called style transfer. For example, the style of a Van Gogh painting might be imposed on a photograph to yield an interesting hybrid. This paper applies the adaptive normalization used for image style transfer to language semantics, i.e., the style is the way the words are said (tone of voice and facial expressions) and these are style-transferred onto the text. The goal is to learn richer representations for multi-modal utterances using style-transferred multi-modal features. The proposed Style-Transfer Transformer (STT) grafts a stepped styled adaptive layer-normalization onto a transformer network, the output from which is used in sentiment analysis and emotion recognition problems. In addition to achieving performance on par with the state-of-the art (but using less than a third of the model parameters), we examine the relative contributions of each mode when used in the downstream applications.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sun-etal-2021-new">
<titleInfo>
<title>A New View of Multi-modal Language Analysis: Audio and Video Features as Text “Styles”</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhongkai</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prathusha</namePart>
<namePart type="given">K</namePart>
<namePart type="family">Sarma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yingyu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="family">Sethares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Merlo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Imposing the style of one image onto another is called style transfer. For example, the style of a Van Gogh painting might be imposed on a photograph to yield an interesting hybrid. This paper applies the adaptive normalization used for image style transfer to language semantics, i.e., the style is the way the words are said (tone of voice and facial expressions) and these are style-transferred onto the text. The goal is to learn richer representations for multi-modal utterances using style-transferred multi-modal features. The proposed Style-Transfer Transformer (STT) grafts a stepped styled adaptive layer-normalization onto a transformer network, the output from which is used in sentiment analysis and emotion recognition problems. In addition to achieving performance on par with the state-of-the art (but using less than a third of the model parameters), we examine the relative contributions of each mode when used in the downstream applications.</abstract>
<identifier type="citekey">sun-etal-2021-new</identifier>
<identifier type="doi">10.18653/v1/2021.eacl-main.167</identifier>
<location>
<url>https://aclanthology.org/2021.eacl-main.167</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>1956</start>
<end>1965</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A New View of Multi-modal Language Analysis: Audio and Video Features as Text “Styles”
%A Sun, Zhongkai
%A Sarma, Prathusha K.
%A Liang, Yingyu
%A Sethares, William
%Y Merlo, Paola
%Y Tiedemann, Jorg
%Y Tsarfaty, Reut
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F sun-etal-2021-new
%X Imposing the style of one image onto another is called style transfer. For example, the style of a Van Gogh painting might be imposed on a photograph to yield an interesting hybrid. This paper applies the adaptive normalization used for image style transfer to language semantics, i.e., the style is the way the words are said (tone of voice and facial expressions) and these are style-transferred onto the text. The goal is to learn richer representations for multi-modal utterances using style-transferred multi-modal features. The proposed Style-Transfer Transformer (STT) grafts a stepped styled adaptive layer-normalization onto a transformer network, the output from which is used in sentiment analysis and emotion recognition problems. In addition to achieving performance on par with the state-of-the art (but using less than a third of the model parameters), we examine the relative contributions of each mode when used in the downstream applications.
%R 10.18653/v1/2021.eacl-main.167
%U https://aclanthology.org/2021.eacl-main.167
%U https://doi.org/10.18653/v1/2021.eacl-main.167
%P 1956-1965
Markdown (Informal)
[A New View of Multi-modal Language Analysis: Audio and Video Features as Text “Styles”](https://aclanthology.org/2021.eacl-main.167) (Sun et al., EACL 2021)
ACL