@inproceedings{chen-etal-2025-grpo,
title = "{GRPO}-Guided Modality Selection Enhanced {L}o{RA}-Tuned {LLM}s for Multimodal Emotion Recognition",
author = "Chen, Yang and
Yang, Shuwan and
Xiang, Yan and
Song, Ran and
Huang, Yuxin and
Yu, Zhengtao",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1059/",
doi = "10.18653/v1/2025.findings-emnlp.1059",
pages = "19458--19471",
ISBN = "979-8-89176-335-7",
abstract = "Multimodal emotion recognition in conversation (MERC) aims to identify speakers' emotional states by utilizing text, audio, and visual modalities. Although recent large language model (LLM)-based methods have demonstrated strong performance, they typically adopt static fusion strategies that integrate all available modalities uniformly. This overlooks the fact that the necessity of multimodal cues can vary significantly across utterances. In this work, we propose an adaptive modality selection framework for MERC. The core of our approach is a modality selection module based on Group Relative Policy Optimization (GRPO), which enables a LoRA-tuned LLM to reason about the necessity of multimodal input via chain-of-thought (CoT) generation. This process does not require manually labeled modality selection data and is trained in a fully unsupervised manner. The selected modality configuration is then provided as input to a downstream emotion classifier, which is also implemented using a LoRA-tuned LLM and trained to predict emotional states. Experimental results on benchmark multimodal dialogue datasets show that our method consistently outperforms strong baselines, demonstrating the effectiveness of adaptive modality selection in improving recognition accuracy. Our code is available at \url{https://github.com/youflyaway/Modality-Selection-Enhanced-LoRA-Tuned-LLMs}."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2025-grpo">
<titleInfo>
<title>GRPO-Guided Modality Selection Enhanced LoRA-Tuned LLMs for Multimodal Emotion Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuwan</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ran</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuxin</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengtao</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Multimodal emotion recognition in conversation (MERC) aims to identify speakers’ emotional states by utilizing text, audio, and visual modalities. Although recent large language model (LLM)-based methods have demonstrated strong performance, they typically adopt static fusion strategies that integrate all available modalities uniformly. This overlooks the fact that the necessity of multimodal cues can vary significantly across utterances. In this work, we propose an adaptive modality selection framework for MERC. The core of our approach is a modality selection module based on Group Relative Policy Optimization (GRPO), which enables a LoRA-tuned LLM to reason about the necessity of multimodal input via chain-of-thought (CoT) generation. This process does not require manually labeled modality selection data and is trained in a fully unsupervised manner. The selected modality configuration is then provided as input to a downstream emotion classifier, which is also implemented using a LoRA-tuned LLM and trained to predict emotional states. Experimental results on benchmark multimodal dialogue datasets show that our method consistently outperforms strong baselines, demonstrating the effectiveness of adaptive modality selection in improving recognition accuracy. Our code is available at https://github.com/youflyaway/Modality-Selection-Enhanced-LoRA-Tuned-LLMs.</abstract>
<identifier type="citekey">chen-etal-2025-grpo</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.1059</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1059/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>19458</start>
<end>19471</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GRPO-Guided Modality Selection Enhanced LoRA-Tuned LLMs for Multimodal Emotion Recognition
%A Chen, Yang
%A Yang, Shuwan
%A Xiang, Yan
%A Song, Ran
%A Huang, Yuxin
%A Yu, Zhengtao
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F chen-etal-2025-grpo
%X Multimodal emotion recognition in conversation (MERC) aims to identify speakers’ emotional states by utilizing text, audio, and visual modalities. Although recent large language model (LLM)-based methods have demonstrated strong performance, they typically adopt static fusion strategies that integrate all available modalities uniformly. This overlooks the fact that the necessity of multimodal cues can vary significantly across utterances. In this work, we propose an adaptive modality selection framework for MERC. The core of our approach is a modality selection module based on Group Relative Policy Optimization (GRPO), which enables a LoRA-tuned LLM to reason about the necessity of multimodal input via chain-of-thought (CoT) generation. This process does not require manually labeled modality selection data and is trained in a fully unsupervised manner. The selected modality configuration is then provided as input to a downstream emotion classifier, which is also implemented using a LoRA-tuned LLM and trained to predict emotional states. Experimental results on benchmark multimodal dialogue datasets show that our method consistently outperforms strong baselines, demonstrating the effectiveness of adaptive modality selection in improving recognition accuracy. Our code is available at https://github.com/youflyaway/Modality-Selection-Enhanced-LoRA-Tuned-LLMs.
%R 10.18653/v1/2025.findings-emnlp.1059
%U https://aclanthology.org/2025.findings-emnlp.1059/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.1059
%P 19458-19471
Markdown (Informal)
[GRPO-Guided Modality Selection Enhanced LoRA-Tuned LLMs for Multimodal Emotion Recognition](https://aclanthology.org/2025.findings-emnlp.1059/) (Chen et al., Findings 2025)
ACL