% BibTeX record for "Fast Retrieval and Slow Reasoning for Explainable
% Multimodal Sentiment Analysis" (Zhu, Hu and Xing), Findings of ACL 2026.
% Text outside an @entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{zhu-etal-2026-fast,
  title     = {Fast Retrieval and Slow Reasoning for Explainable Multimodal Sentiment Analysis},
  author    = {Zhu, Aoqiang and
               Hu, Min and
               Xing, Yan},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1519/},
  pages     = {30381--30391},
  isbn      = {979-8-89176-395-1},
  abstract  = {Most existing Multimodal Sentiment Analysis (MSA) methods rely on holistic fusion, treating all modalities and temporal segments equally. Such strategies often introduce redundant information and obscure the decision process, limiting both robustness and interpretability. Inspired by dual-process theory, we propose FRSR (Fast Retrieval and Slow Reasoning), an interpretable framework that decomposes multimodal sentiment modeling into two cooperative pathways. The Fast Pathway acts as a lightweight evidence selector, using context-aware convolution and auxiliary supervision to retrieve a sparse set of Top-$K$ sentiment-relevant cues from noisy multimodal inputs. Based on these cues, the Slow Pathway performs deeper cross-modal reasoning through learnable reasoning tokens, enabling hierarchical sentiment inference. By separating salient evidence retrieval from multimodal reasoning, FRSR improves interpretability while reducing computational cost. Experiments on three benchmark datasets show that FRSR achieves competitive performance, higher efficiency, stronger robustness to noise, and clearer decision transparency than existing holistic fusion methods.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for a single paper; the ID matches the citekey identifier below. -->
  <mods ID="zhu-etal-2026-fast">
    <titleInfo>
      <title>Fast Retrieval and Slow Reasoning for Explainable Multimodal Sentiment Analysis</title>
    </titleInfo>

    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Aoqiang</namePart>
      <namePart type="family">Zhu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Min</namePart>
      <namePart type="family">Hu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yan</namePart>
      <namePart type="family">Xing</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume containing the paper, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <abstract>Most existing Multimodal Sentiment Analysis (MSA) methods rely on holistic fusion, treating all modalities and temporal segments equally. Such strategies often introduce redundant information and obscure the decision process, limiting both robustness and interpretability. Inspired by dual-process theory, we propose FRSR (Fast Retrieval and Slow Reasoning), an interpretable framework that decomposes multimodal sentiment modeling into two cooperative pathways. The Fast Pathway acts as a lightweight evidence selector, using context-aware convolution and auxiliary supervision to retrieve a sparse set of Top-K sentiment-relevant cues from noisy multimodal inputs. Based on these cues, the Slow Pathway performs deeper cross-modal reasoning through learnable reasoning tokens, enabling hierarchical sentiment inference. By separating salient evidence retrieval from multimodal reasoning, FRSR improves interpretability while reducing computational cost. Experiments on three benchmark datasets show that FRSR achieves competitive performance, higher efficiency, stronger robustness to noise, and clearer decision transparency than existing holistic fusion methods.</abstract>
    <identifier type="citekey">zhu-etal-2026-fast</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1519/</url>
    </location>

    <!-- Issue date and page range of the paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>30381</start>
        <end>30391</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Fast Retrieval and Slow Reasoning for Explainable Multimodal Sentiment Analysis
%A Zhu, Aoqiang
%A Hu, Min
%A Xing, Yan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhu-etal-2026-fast
%X Most existing Multimodal Sentiment Analysis (MSA) methods rely on holistic fusion, treating all modalities and temporal segments equally. Such strategies often introduce redundant information and obscure the decision process, limiting both robustness and interpretability. Inspired by dual-process theory, we propose FRSR (Fast Retrieval and Slow Reasoning), an interpretable framework that decomposes multimodal sentiment modeling into two cooperative pathways. The Fast Pathway acts as a lightweight evidence selector, using context-aware convolution and auxiliary supervision to retrieve a sparse set of Top-K sentiment-relevant cues from noisy multimodal inputs. Based on these cues, the Slow Pathway performs deeper cross-modal reasoning through learnable reasoning tokens, enabling hierarchical sentiment inference. By separating salient evidence retrieval from multimodal reasoning, FRSR improves interpretability while reducing computational cost. Experiments on three benchmark datasets show that FRSR achieves competitive performance, higher efficiency, stronger robustness to noise, and clearer decision transparency than existing holistic fusion methods.
%U https://aclanthology.org/2026.findings-acl.1519/
%P 30381-30391
Markdown (Informal)
[Fast Retrieval and Slow Reasoning for Explainable Multimodal Sentiment Analysis](https://aclanthology.org/2026.findings-acl.1519/) (Zhu et al., Findings 2026)
ACL