@inproceedings{das-2021-classifying-emotional,
title = "Classifying Emotional Utterances by Employing Multi-modal Speech Emotion Recognition",
author = "Das, Dipankar",
editor = "Biswas, Anupam and
Laskar, Rabul Hussain and
Roy, Pinki",
booktitle = "Proceedings of the Workshop on Speech and Music Processing 2021",
month = dec,
year = "2021",
address = "NIT Silchar, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2021.smp-1.1",
pages = "1--13",
abstract = "Deep learning methods are being applied to several speech processing problems in recent years. In the present work, we have explored different deep learning models for speech emotion recognition. We have employed normal deep feedforward neural network (FFNN) and convolutional neural network (CNN) to classify audio files according to their emotional content. Comparative study indicates that CNN model outperforms FFNN in case of emotions as well as gender classification. It was observed that the sole audio based models can capture the emotions up to a certain limit. Thus, we attempted a multi-modal framework by combining the benefits of the audio and text features and employed them into a recurrent encoder. Finally, the audio and text encoders are merged to provide the desired impact on various datasets. In addition, a database consisting of emotional utterances of several words has also been developed as a part of this work. It contains the same word in different emotional utterances. Though the size of the database is not that large, this database is ideally supposed to contain all the English words that exist in an English dictionary.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="das-2021-classifying-emotional">
<titleInfo>
<title>Classifying Emotional Utterances by Employing Multi-modal Speech Emotion Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dipankar</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Speech and Music Processing 2021</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anupam</namePart>
<namePart type="family">Biswas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rabul</namePart>
<namePart type="given">Hussain</namePart>
<namePart type="family">Laskar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinki</namePart>
<namePart type="family">Roy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">NIT Silchar, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Deep learning methods are being applied to several speech processing problems in recent years. In the present work, we have explored different deep learning models for speech emotion recognition. We have employed normal deep feedforward neural network (FFNN) and convolutional neural network (CNN) to classify audio files according to their emotional content. Comparative study indicates that CNN model outperforms FFNN in case of emotions as well as gender classification. It was observed that the sole audio based models can capture the emotions up to a certain limit. Thus, we attempted a multi-modal framework by combining the benefits of the audio and text features and employed them into a recurrent encoder. Finally, the audio and text encoders are merged to provide the desired impact on various datasets. In addition, a database consisting of emotional utterances of several words has also been developed as a part of this work. It contains the same word in different emotional utterances. Though the size of the database is not that large, this database is ideally supposed to contain all the English words that exist in an English dictionary.</abstract>
<identifier type="citekey">das-2021-classifying-emotional</identifier>
<location>
<url>https://aclanthology.org/2021.smp-1.1</url>
</location>
<part>
<date>2021-12</date>
<extent unit="page">
<start>1</start>
<end>13</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Classifying Emotional Utterances by Employing Multi-modal Speech Emotion Recognition
%A Das, Dipankar
%Y Biswas, Anupam
%Y Laskar, Rabul Hussain
%Y Roy, Pinki
%S Proceedings of the Workshop on Speech and Music Processing 2021
%D 2021
%8 December
%I NLP Association of India (NLPAI)
%C NIT Silchar, India
%F das-2021-classifying-emotional
%X Deep learning methods are being applied to several speech processing problems in recent years. In the present work, we have explored different deep learning models for speech emotion recognition. We have employed normal deep feedforward neural network (FFNN) and convolutional neural network (CNN) to classify audio files according to their emotional content. Comparative study indicates that CNN model outperforms FFNN in case of emotions as well as gender classification. It was observed that the sole audio based models can capture the emotions up to a certain limit. Thus, we attempted a multi-modal framework by combining the benefits of the audio and text features and employed them into a recurrent encoder. Finally, the audio and text encoders are merged to provide the desired impact on various datasets. In addition, a database consisting of emotional utterances of several words has also been developed as a part of this work. It contains the same word in different emotional utterances. Though the size of the database is not that large, this database is ideally supposed to contain all the English words that exist in an English dictionary.
%U https://aclanthology.org/2021.smp-1.1
%P 1-13
Markdown (Informal)
[Classifying Emotional Utterances by Employing Multi-modal Speech Emotion Recognition](https://aclanthology.org/2021.smp-1.1) (Das, SMP 2021)
ACL