@inproceedings{zhang-etal-2025-modal,
title = "Modal Feature Optimization Network with Prompt for Multimodal Sentiment Analysis",
author = "Zhang, Xiangmin and
Wei, Wei and
Zou, Shihao",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.309/",
pages = "4611--4621",
abstract = "Multimodal sentiment analysis(MSA) is mostly used to understand human emotional states through multimodal. However, due to the fact that the effective information carried by multimodal is not balanced, the modality containing less effective information cannot fully play the complementary role between modalities. Therefore, the goal of this paper is to fully explore the effective information in modalities and further optimize the under-optimized modal representation.To this end, we propose a novel \textbf{M}odal \textbf{F}eature \textbf{O}ptimization \textbf{N}etwork (MFON) with a \textbf{M}odal \textbf{P}rompt \textbf{A}ttention (MPA) mechanism for MSA. Specifically, we first determine which modalities are under-optimized in MSA, and then use relevant prompt information to focus the model on these features. This allows the model to focus more on the features of the modalities that need optimization, improving the utilization of each modality`s feature representation and facilitating initial information aggregation across modalities. Subsequently, we design an intra-modal knowledge distillation strategy for under-optimized modalities. This approach preserves the integrity of the modal features. Furthermore, we implement inter-modal contrastive learning to better extract related features across modalities, thereby optimizing the entire network. Finally, sentiment prediction is carried out through the effective fusion of multimodal information. Extensive experimental results on public benchmark datasets demonstrate that our proposed method outperforms existing state-of-the-art models."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-modal">
<titleInfo>
<title>Modal Feature Optimization Network with Prompt for Multimodal Sentiment Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiangmin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shihao</namePart>
<namePart type="family">Zou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal sentiment analysis (MSA) aims to understand human emotional states from multimodal data. However, because the effective information carried by the different modalities is not balanced, a modality containing less effective information cannot fully play its complementary role among the modalities. The goal of this paper is therefore to fully exploit the effective information in each modality and to further optimize the under-optimized modal representations. To this end, we propose a novel Modal Feature Optimization Network (MFON) with a Modal Prompt Attention (MPA) mechanism for MSA. Specifically, we first determine which modalities are under-optimized in MSA, and then use relevant prompt information to focus the model on these features. This allows the model to focus more on the features of the modalities that need optimization, improving the utilization of each modality's feature representation and facilitating initial information aggregation across modalities. Subsequently, we design an intra-modal knowledge distillation strategy for the under-optimized modalities, which preserves the integrity of the modal features. Furthermore, we implement inter-modal contrastive learning to better extract related features across modalities, thereby optimizing the entire network. Finally, sentiment prediction is carried out through the effective fusion of multimodal information. Extensive experimental results on public benchmark datasets demonstrate that our proposed method outperforms existing state-of-the-art models.</abstract>
<identifier type="citekey">zhang-etal-2025-modal</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.309/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>4611</start>
<end>4621</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Modal Feature Optimization Network with Prompt for Multimodal Sentiment Analysis
%A Zhang, Xiangmin
%A Wei, Wei
%A Zou, Shihao
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhang-etal-2025-modal
%X Multimodal sentiment analysis (MSA) aims to understand human emotional states from multimodal data. However, because the effective information carried by the different modalities is not balanced, a modality containing less effective information cannot fully play its complementary role among the modalities. The goal of this paper is therefore to fully exploit the effective information in each modality and to further optimize the under-optimized modal representations. To this end, we propose a novel Modal Feature Optimization Network (MFON) with a Modal Prompt Attention (MPA) mechanism for MSA. Specifically, we first determine which modalities are under-optimized in MSA, and then use relevant prompt information to focus the model on these features. This allows the model to focus more on the features of the modalities that need optimization, improving the utilization of each modality's feature representation and facilitating initial information aggregation across modalities. Subsequently, we design an intra-modal knowledge distillation strategy for the under-optimized modalities, which preserves the integrity of the modal features. Furthermore, we implement inter-modal contrastive learning to better extract related features across modalities, thereby optimizing the entire network. Finally, sentiment prediction is carried out through the effective fusion of multimodal information. Extensive experimental results on public benchmark datasets demonstrate that our proposed method outperforms existing state-of-the-art models.
%U https://aclanthology.org/2025.coling-main.309/
%P 4611-4621
Markdown (Informal)
[Modal Feature Optimization Network with Prompt for Multimodal Sentiment Analysis](https://aclanthology.org/2025.coling-main.309/) (Zhang et al., COLING 2025)
ACL
Xiangmin Zhang, Wei Wei, and Shihao Zou. 2025. [Modal Feature Optimization Network with Prompt for Multimodal Sentiment Analysis](https://aclanthology.org/2025.coling-main.309/). In *Proceedings of the 31st International Conference on Computational Linguistics*, pages 4611–4621, Abu Dhabi, UAE. Association for Computational Linguistics.