@inproceedings{bellver-soler-etal-2025-cutting,
title = "Cutting Through Overload: Efficient Token Dropping for Speech Emotion Recognition in Multimodal Large Language Models",
author = "Bellver-Soler, Jaime and
Rodriguez-Cantelar, Mario and
C{\'o}rdoba, Ricardo and
D{'}Haro, Luis Fernando",
editor = "Torres, Maria Ines and
Matsuda, Yuki and
Callejas, Zoraida and
del Pozo, Arantza and
D{'}Haro, Luis Fernando",
booktitle = "Proceedings of the 15th International Workshop on Spoken Dialogue Systems Technology",
month = may,
year = "2025",
address = "Bilbao, Spain",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.iwsds-1.30/",
pages = "284--289",
ISBN = "979-8-89176-248-0",
abstract = "Recent developments in Multimodal Large Language Models (MLLMs) have provided novel insights into Speech Emotion Recognition (SER). However, combining high-dimensional speech signals with textual tokens can lead to a rapid growth in input tokens, increasing computational costs and inference times. This ``token overload'' also risks shadowing essential textual cues, affecting the reasoning capabilities of the language model and diluting emotional information crucial to accurate SER. In this paper, we explore different token drop methods that mitigate excessive token counts while preserving both emotional nuances and the core linguistic capabilities of the model. Specifically, we compare various pooling approaches to produce a compact representation. Our preliminary findings suggest that these techniques can reduce computational costs without decreasing SER accuracy."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="bellver-soler-etal-2025-cutting">
    <titleInfo>
      <title>Cutting Through Overload: Efficient Token Dropping for Speech Emotion Recognition in Multimodal Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jaime</namePart>
      <namePart type="family">Bellver-Soler</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mario</namePart>
      <namePart type="family">Rodriguez-Cantelar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ricardo</namePart>
      <namePart type="family">Córdoba</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Luis</namePart>
      <namePart type="given">Fernando</namePart>
      <namePart type="family">D’Haro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 15th International Workshop on Spoken Dialogue Systems Technology</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="given">Ines</namePart>
        <namePart type="family">Torres</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yuki</namePart>
        <namePart type="family">Matsuda</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zoraida</namePart>
        <namePart type="family">Callejas</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Arantza</namePart>
        <namePart type="family">del Pozo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="given">Fernando</namePart>
        <namePart type="family">D’Haro</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bilbao, Spain</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-248-0</identifier>
    </relatedItem>
    <abstract>Recent developments in Multimodal Large Language Models (MLLMs) have provided novel insights into Speech Emotion Recognition (SER). However, combining high-dimensional speech signals with textual tokens can lead to a rapid growth in input tokens, increasing computational costs and inference times. This “token overload” also risks shadowing essential textual cues, affecting the reasoning capabilities of the language model and diluting emotional information crucial to accurate SER. In this paper, we explore different token drop methods that mitigate excessive token counts while preserving both emotional nuances and the core linguistic capabilities of the model. Specifically, we compare various pooling approaches to produce a compact representation. Our preliminary findings suggest that these techniques can reduce computational costs without decreasing SER accuracy.</abstract>
    <identifier type="citekey">bellver-soler-etal-2025-cutting</identifier>
    <location>
      <url>https://aclanthology.org/2025.iwsds-1.30/</url>
    </location>
    <part>
      <date>2025-05</date>
      <extent unit="page">
        <start>284</start>
        <end>289</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Cutting Through Overload: Efficient Token Dropping for Speech Emotion Recognition in Multimodal Large Language Models
%A Bellver-Soler, Jaime
%A Rodriguez-Cantelar, Mario
%A Córdoba, Ricardo
%A D’Haro, Luis Fernando
%Y Torres, Maria Ines
%Y Matsuda, Yuki
%Y Callejas, Zoraida
%Y del Pozo, Arantza
%Y D’Haro, Luis Fernando
%S Proceedings of the 15th International Workshop on Spoken Dialogue Systems Technology
%D 2025
%8 May
%I Association for Computational Linguistics
%C Bilbao, Spain
%@ 979-8-89176-248-0
%F bellver-soler-etal-2025-cutting
%X Recent developments in Multimodal Large Language Models (MLLMs) have provided novel insights into Speech Emotion Recognition (SER). However, combining high-dimensional speech signals with textual tokens can lead to a rapid growth in input tokens, increasing computational costs and inference times. This “token overload” also risks shadowing essential textual cues, affecting the reasoning capabilities of the language model and diluting emotional information crucial to accurate SER. In this paper, we explore different token drop methods that mitigate excessive token counts while preserving both emotional nuances and the core linguistic capabilities of the model. Specifically, we compare various pooling approaches to produce a compact representation. Our preliminary findings suggest that these techniques can reduce computational costs without decreasing SER accuracy.
%U https://aclanthology.org/2025.iwsds-1.30/
%P 284-289
Markdown (Informal)
[Cutting Through Overload: Efficient Token Dropping for Speech Emotion Recognition in Multimodal Large Language Models](https://aclanthology.org/2025.iwsds-1.30/) (Bellver-Soler et al., IWSDS 2025)
ACL
Jaime Bellver-Soler, Mario Rodriguez-Cantelar, Ricardo Córdoba, and Luis Fernando D’Haro. 2025. Cutting Through Overload: Efficient Token Dropping for Speech Emotion Recognition in Multimodal Large Language Models. In Proceedings of the 15th International Workshop on Spoken Dialogue Systems Technology, pages 284–289, Bilbao, Spain. Association for Computational Linguistics.
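To make the abstract's phrase "pooling approaches to produce a compact representation" concrete, here is a minimal sketch of one common baseline: fixed-stride average pooling over frame-level speech embeddings, so fewer speech tokens reach the language model. This is not the authors' implementation; the function name `pool_speech_tokens`, the stride value, and the tensor shapes are illustrative assumptions.

```python
# Hedged sketch (not the paper's code): fixed-stride average pooling that
# compresses (batch, T, dim) speech-encoder frames into (batch, ceil(T/stride), dim)
# tokens before they are concatenated with the text tokens of an MLLM.
import torch
import torch.nn.functional as F

def pool_speech_tokens(frames: torch.Tensor, stride: int = 4) -> torch.Tensor:
    """Mean-pool non-overlapping windows of `stride` frames.

    The last window is zero-padded, which slightly biases its mean toward zero.
    """
    batch, t, dim = frames.shape
    pad = (-t) % stride                   # pad T up to a multiple of stride
    if pad:
        frames = F.pad(frames, (0, 0, 0, pad))
    # (batch, T/stride, stride, dim) -> mean over the window axis
    return frames.view(batch, -1, stride, dim).mean(dim=2)

if __name__ == "__main__":
    speech = torch.randn(2, 250, 1024)    # e.g. 5 s of 50 Hz encoder frames
    compact = pool_speech_tokens(speech, stride=4)
    print(compact.shape)                  # torch.Size([2, 63, 1024])
```

With stride 4, a 250-frame utterance is compressed to 63 speech tokens, roughly a fourfold reduction in the speech share of the model's input; how aggressively one can pool before emotional cues are lost is the trade-off the paper's abstract highlights.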