% ACL Anthology record 2026.findings-acl.1466 (Findings of ACL 2026).
% Text outside the entry is ignored by BibTeX; values are brace-delimited so
% they can safely contain double quotes if the record is ever edited.
@inproceedings{zhang-etal-2026-coarse,
  title     = {Coarse-to-Fine Multimodal Information Selection for Video Speaking Style Recognition with Large Language Models},
  author    = {Zhang, Beibei and
               Lu, Yanan and
               Fen, Lin and
               Ren, Tongwei},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1466/},
  pages     = {29322--29337},
  isbn      = {979-8-89176-395-1},
  abstract  = {Video Speaking Style Recognition (VSSR) aims to classify conversation videos into different types, significantly facilitating human interaction understanding. Recent approaches explore the potential of large language models (LLM) in VSSR with a training-free process. However, directly integrating all multimodal data yields suboptimal results, since the great redundancy in visual data can overshadow other valuable multimodal information, such as valuable textual dialogues and critical visual clues. To address this, we propose CFMiS (Coarse-to-Fine Multimodal Information Selection), a novel framework for VSSR that dynamically obtain valuable multimodal data via coarse-to-fine selection, enhancing LLM reasoning for VSSR. Specifically, the core of CFMiS are two cascaded modules: 1) a text-dominant modality selection module firstly selects VSSR-required modalities originating from text-based prediction; and 2) if vision is included in the selected modalities, a visual refinement module iteratively collects VSSR-relevant critical visual clues. The former resolves which modality to utilize, while the latter determines which information to adopt from selected modalities, efficiently alleviating information redundancy. Extensive experiments on multiple datasets prove that CFMiS is highly effective for VSSR, outperforming all existing training-free approaches and most training-based methods.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- One MODS record; its ID equals the citekey identifier further down. -->
  <mods ID="zhang-etal-2026-coarse">
    <titleInfo>
      <title>Coarse-to-Fine Multimodal Information Selection for Video Speaking Style Recognition with Large Language Models</title>
    </titleInfo>

    <!-- Paper authors, in the order listed. -->
    <name type="personal">
      <namePart type="given">Beibei</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yanan</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lin</namePart>
      <namePart type="family">Fen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tongwei</namePart>
      <namePart type="family">Ren</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <abstract>Video Speaking Style Recognition (VSSR) aims to classify conversation videos into different types, significantly facilitating human interaction understanding. Recent approaches explore the potential of large language models (LLM) in VSSR with a training-free process. However, directly integrating all multimodal data yields suboptimal results, since the great redundancy in visual data can overshadow other valuable multimodal information, such as valuable textual dialogues and critical visual clues. To address this, we propose CFMiS (Coarse-to-Fine Multimodal Information Selection), a novel framework for VSSR that dynamically obtain valuable multimodal data via coarse-to-fine selection, enhancing LLM reasoning for VSSR. Specifically, the core of CFMiS are two cascaded modules: 1) a text-dominant modality selection module firstly selects VSSR-required modalities originating from text-based prediction; and 2) if vision is included in the selected modalities, a visual refinement module iteratively collects VSSR-relevant critical visual clues. The former resolves which modality to utilize, while the latter determines which information to adopt from selected modalities, efficiently alleviating information redundancy. Extensive experiments on multiple datasets prove that CFMiS is highly effective for VSSR, outperforming all existing training-free approaches and most training-based methods.</abstract>
    <identifier type="citekey">zhang-etal-2026-coarse</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1466/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>29322</start>
        <end>29337</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Coarse-to-Fine Multimodal Information Selection for Video Speaking Style Recognition with Large Language Models
%A Zhang, Beibei
%A Lu, Yanan
%A Fen, Lin
%A Ren, Tongwei
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhang-etal-2026-coarse
%X Video Speaking Style Recognition (VSSR) aims to classify conversation videos into different types, significantly facilitating human interaction understanding. Recent approaches explore the potential of large language models (LLM) in VSSR with a training-free process. However, directly integrating all multimodal data yields suboptimal results, since the great redundancy in visual data can overshadow other valuable multimodal information, such as valuable textual dialogues and critical visual clues. To address this, we propose CFMiS (Coarse-to-Fine Multimodal Information Selection), a novel framework for VSSR that dynamically obtain valuable multimodal data via coarse-to-fine selection, enhancing LLM reasoning for VSSR. Specifically, the core of CFMiS are two cascaded modules: 1) a text-dominant modality selection module firstly selects VSSR-required modalities originating from text-based prediction; and 2) if vision is included in the selected modalities, a visual refinement module iteratively collects VSSR-relevant critical visual clues. The former resolves which modality to utilize, while the latter determines which information to adopt from selected modalities, efficiently alleviating information redundancy. Extensive experiments on multiple datasets prove that CFMiS is highly effective for VSSR, outperforming all existing training-free approaches and most training-based methods.
%U https://aclanthology.org/2026.findings-acl.1466/
%P 29322-29337
Markdown (Informal)
[Coarse-to-Fine Multimodal Information Selection for Video Speaking Style Recognition with Large Language Models](https://aclanthology.org/2026.findings-acl.1466/) (Zhang et al., Findings 2026)
ACL