@inproceedings{tits-etal-2018-asr,
title = "{ASR}-based Features for Emotion Recognition: A Transfer Learning Approach",
author = "Tits, No{\'e} and
El Haddad, Kevin and
Dutoit, Thierry",
editor = "Zadeh, Amir and
Liang, Paul Pu and
Morency, Louis-Philippe and
Poria, Soujanya and
Cambria, Erik and
Scherer, Stefan",
booktitle = "Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-{HML})",
month = jul,
year = "2018",
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-3307",
doi = "10.18653/v1/W18-3307",
pages = "48--52",
abstract = "During the last decade, the applications of signal processing have drastically improved with deep learning. However areas of affecting computing such as emotional speech synthesis or emotion recognition from spoken language remains challenging. In this paper, we investigate the use of a neural Automatic Speech Recognition (ASR) as a feature extractor for emotion recognition. We show that these features outperform the eGeMAPS feature set to predict the valence and arousal emotional dimensions, which means that the audio-to-text mapping learned by the ASR system contains information related to the emotional dimensions in spontaneous speech. We also examine the relationship between first layers (closer to speech) and last layers (closer to text) of the ASR and valence/arousal.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tits-etal-2018-asr">
<titleInfo>
<title>ASR-based Features for Emotion Recognition: A Transfer Learning Approach</title>
</titleInfo>
<name type="personal">
<namePart type="given">Noé</namePart>
<namePart type="family">Tits</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">El Haddad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Dutoit</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-HML)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="given">Pu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis-Philippe</namePart>
<namePart type="family">Morency</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soujanya</namePart>
<namePart type="family">Poria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Cambria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="family">Scherer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Melbourne, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>During the last decade, the applications of signal processing have drastically improved with deep learning. However areas of affecting computing such as emotional speech synthesis or emotion recognition from spoken language remains challenging. In this paper, we investigate the use of a neural Automatic Speech Recognition (ASR) as a feature extractor for emotion recognition. We show that these features outperform the eGeMAPS feature set to predict the valence and arousal emotional dimensions, which means that the audio-to-text mapping learned by the ASR system contains information related to the emotional dimensions in spontaneous speech. We also examine the relationship between first layers (closer to speech) and last layers (closer to text) of the ASR and valence/arousal.</abstract>
<identifier type="citekey">tits-etal-2018-asr</identifier>
<identifier type="doi">10.18653/v1/W18-3307</identifier>
<location>
<url>https://aclanthology.org/W18-3307</url>
</location>
<part>
<date>2018-07</date>
<extent unit="page">
<start>48</start>
<end>52</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ASR-based Features for Emotion Recognition: A Transfer Learning Approach
%A Tits, Noé
%A El Haddad, Kevin
%A Dutoit, Thierry
%Y Zadeh, Amir
%Y Liang, Paul Pu
%Y Morency, Louis-Philippe
%Y Poria, Soujanya
%Y Cambria, Erik
%Y Scherer, Stefan
%S Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-HML)
%D 2018
%8 July
%I Association for Computational Linguistics
%C Melbourne, Australia
%F tits-etal-2018-asr
%X During the last decade, the applications of signal processing have drastically improved with deep learning. However areas of affecting computing such as emotional speech synthesis or emotion recognition from spoken language remains challenging. In this paper, we investigate the use of a neural Automatic Speech Recognition (ASR) as a feature extractor for emotion recognition. We show that these features outperform the eGeMAPS feature set to predict the valence and arousal emotional dimensions, which means that the audio-to-text mapping learned by the ASR system contains information related to the emotional dimensions in spontaneous speech. We also examine the relationship between first layers (closer to speech) and last layers (closer to text) of the ASR and valence/arousal.
%R 10.18653/v1/W18-3307
%U https://aclanthology.org/W18-3307
%U https://doi.org/10.18653/v1/W18-3307
%P 48-52
Markdown (Informal)
[ASR-based Features for Emotion Recognition: A Transfer Learning Approach](https://aclanthology.org/W18-3307) (Tits et al., ACL 2018)
ACL