% Conference paper in the CLiC-it 2024 proceedings; cite key: dasaro-etal-2024-using.
% NOTE(review): `address` presumably records the conference venue rather than the
% publisher's city -- confirm before using a style that prints it as publisher location.
@inproceedings{dasaro-etal-2024-using,
  title     = {Using Large Speech Models for Feature Extraction in Cross-Lingual Speech Emotion Recognition},
  author    = {D'asaro, Federico and
               M{\'a}rquez Villac{\'i}s, Juan Jos{\'e} and
               Rizzo, Giuseppe and
               Bottino, Andrea},
  editor    = {Dell'Orletta, Felice and
               Lenci, Alessandro and
               Montemagni, Simonetta and
               Sprugnoli, Rachele},
  booktitle = {Proceedings of the 10th Italian Conference on Computational Linguistics ({CLiC-it} 2024)},
  month     = dec,
  year      = {2024},
  address   = {Pisa, Italy},
  publisher = {CEUR Workshop Proceedings},
  url       = {https://aclanthology.org/2024.clicit-1.31/},
  pages     = {258--265},
  isbn      = {979-12-210-7060-6},
  abstract  = {Large Speech Models (LSMs), pre-trained on extensive unlabeled data using Self-Supervised Learning (SSL) or Weakly-Supervised Learning (WSL), are increasingly employed for tasks like Speech Emotion Recognition (SER). Their capability to extract general-purpose features makes them a strong alternative to low-level descriptors. Most studies focus on English, with limited research on other languages. We evaluate English-Only and Multilingual LSMs from the Wav2Vec 2.0 and Whisper families as feature extractors for SER in eight languages. We have stacked three alternative downstream classifiers of increasing complexity, named Linear, Non-Linear, and Multi-Layer, on top of the LSMs. Results indicate that Whisper models perform best with a simple linear classifier using features from the last transformer layer, while Wav2Vec 2.0 models benefit from features from the middle and early transformer layers. When comparing English-Only and Multilingual LSMs, we find that Whisper models benefit from multilingual pre-training, excelling in Italian, Canadian French, French, Spanish, German and competitively on Greek, Egyptian Arabic, Persian. In contrast, English-Only Wav2Vec 2.0 models outperform their multilingual counterpart, XLS-R, in most languages, achieving the highest performance in Greek, Egyptian Arabic.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dasaro-etal-2024-using">
<titleInfo>
<title>Using Large Speech Models for Feature Extraction in Cross-Lingual Speech Emotion Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Federico</namePart>
<namePart type="family">D’asaro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">José</namePart>
<namePart type="family">Márquez Villacís</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Rizzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Bottino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>Large Speech Models (LSMs), pre-trained on extensive unlabeled data using Self-Supervised Learning (SSL) or Weakly-Supervised Learning (WSL), are increasingly employed for tasks like Speech Emotion Recognition (SER). Their capability to extract general-purpose features makes them a strong alternative to low-level descriptors. Most studies focus on English, with limited research on other languages. We evaluate English-Only and Multilingual LSMs from the Wav2Vec 2.0 and Whisper families as feature extractors for SER in eight languages. We have stacked three alternative downstream classifiers of increasing complexity, named Linear, Non-Linear, and Multi-Layer, on top of the LSMs. Results indicate that Whisper models perform best with a simple linear classifier using features from the last transformer layer, while Wav2Vec 2.0 models benefit from features from the middle and early transformer layers. When comparing English-Only and Multilingual LSMs, we find that Whisper models benefit from multilingual pre-training, excelling in Italian, Canadian French, French, Spanish, German and competitively on Greek, Egyptian Arabic, Persian. In contrast, English-Only Wav2Vec 2.0 models outperform their multilingual counterpart, XLS-R, in most languages, achieving the highest performance in Greek, Egyptian Arabic.</abstract>
<identifier type="citekey">dasaro-etal-2024-using</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.31/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>258</start>
<end>265</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Using Large Speech Models for Feature Extraction in Cross-Lingual Speech Emotion Recognition
%A D’asaro, Federico
%A Márquez Villacís, Juan José
%A Rizzo, Giuseppe
%A Bottino, Andrea
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F dasaro-etal-2024-using
%X Large Speech Models (LSMs), pre-trained on extensive unlabeled data using Self-Supervised Learning (SSL) or Weakly-Supervised Learning (WSL), are increasingly employed for tasks like Speech Emotion Recognition (SER). Their capability to extract general-purpose features makes them a strong alternative to low-level descriptors. Most studies focus on English, with limited research on other languages. We evaluate English-Only and Multilingual LSMs from the Wav2Vec 2.0 and Whisper families as feature extractors for SER in eight languages. We have stacked three alternative downstream classifiers of increasing complexity, named Linear, Non-Linear, and Multi-Layer, on top of the LSMs. Results indicate that Whisper models perform best with a simple linear classifier using features from the last transformer layer, while Wav2Vec 2.0 models benefit from features from the middle and early transformer layers. When comparing English-Only and Multilingual LSMs, we find that Whisper models benefit from multilingual pre-training, excelling in Italian, Canadian French, French, Spanish, German and competitively on Greek, Egyptian Arabic, Persian. In contrast, English-Only Wav2Vec 2.0 models outperform their multilingual counterpart, XLS-R, in most languages, achieving the highest performance in Greek, Egyptian Arabic.
%U https://aclanthology.org/2024.clicit-1.31/
%P 258-265
Markdown (Informal)
[Using Large Speech Models for Feature Extraction in Cross-Lingual Speech Emotion Recognition](https://aclanthology.org/2024.clicit-1.31/) (D’asaro et al., CLiC-it 2024)
ACL