@inproceedings{li-liu-2025-mpid,
title = "{MPID}: A Modality-Preserving and Interaction-Driven Fusion Network for Multimodal Sentiment Analysis",
author = "Li, Tianyi and
Liu, Daming",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.291/",
pages = "4313--4322",
abstract = "The advancement of social media has intensified interest in the research direction of Multimodal Sentiment Analysis (MSA). However, current methodologies exhibit relative limitations, particularly in their fusion mechanisms that overlook nuanced differences and similarities across modalities, leading to potential biases in MSA. In addition, indiscriminate fusion across modalities can introduce unnecessary complexity and noise, undermining the effectiveness of the analysis. In this essay, a Modal-Preserving and Interaction-Driven Fusion Network is introduced to address the aforementioned challenges. The compressed representations of each modality are initially obtained through a Token Refinement Module. Subsequently, we employ a Dual Perception Fusion Module to integrate text with audio and a separate Adaptive Graded Fusion Module for text and visual data. The final step leverages text representation to enhance composite representation. Our experiments on CMU-MOSI, CMU-MOSEI, and CH-SIMS datasets demonstrate that our model achieves state-of-the-art performance."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-liu-2025-mpid">
<titleInfo>
<title>MPID: A Modality-Preserving and Interaction-Driven Fusion Network for Multimodal Sentiment Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianyi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daming</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The advancement of social media has intensified interest in the research direction of Multimodal Sentiment Analysis (MSA). However, current methodologies exhibit relative limitations, particularly in their fusion mechanisms that overlook nuanced differences and similarities across modalities, leading to potential biases in MSA. In addition, indiscriminate fusion across modalities can introduce unnecessary complexity and noise, undermining the effectiveness of the analysis. In this essay, a Modal-Preserving and Interaction-Driven Fusion Network is introduced to address the aforementioned challenges. The compressed representations of each modality are initially obtained through a Token Refinement Module. Subsequently, we employ a Dual Perception Fusion Module to integrate text with audio and a separate Adaptive Graded Fusion Module for text and visual data. The final step leverages text representation to enhance composite representation. Our experiments on CMU-MOSI, CMU-MOSEI, and CH-SIMS datasets demonstrate that our model achieves state-of-the-art performance.</abstract>
<identifier type="citekey">li-liu-2025-mpid</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.291/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>4313</start>
<end>4322</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MPID: A Modality-Preserving and Interaction-Driven Fusion Network for Multimodal Sentiment Analysis
%A Li, Tianyi
%A Liu, Daming
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F li-liu-2025-mpid
%X The advancement of social media has intensified interest in the research direction of Multimodal Sentiment Analysis (MSA). However, current methodologies exhibit relative limitations, particularly in their fusion mechanisms that overlook nuanced differences and similarities across modalities, leading to potential biases in MSA. In addition, indiscriminate fusion across modalities can introduce unnecessary complexity and noise, undermining the effectiveness of the analysis. In this essay, a Modal-Preserving and Interaction-Driven Fusion Network is introduced to address the aforementioned challenges. The compressed representations of each modality are initially obtained through a Token Refinement Module. Subsequently, we employ a Dual Perception Fusion Module to integrate text with audio and a separate Adaptive Graded Fusion Module for text and visual data. The final step leverages text representation to enhance composite representation. Our experiments on CMU-MOSI, CMU-MOSEI, and CH-SIMS datasets demonstrate that our model achieves state-of-the-art performance.
%U https://aclanthology.org/2025.coling-main.291/
%P 4313-4322
Markdown (Informal)
[MPID: A Modality-Preserving and Interaction-Driven Fusion Network for Multimodal Sentiment Analysis](https://aclanthology.org/2025.coling-main.291/) (Li & Liu, COLING 2025)
ACL