% LREC-COLING 2024 conference paper; the url field points to its ACL Anthology page.
@inproceedings{wang-etal-2024-tmfn,
  author    = {Wang, Di and
               He, Yuzheng and
               Liang, Xiao and
               Tian, Yumin and
               Li, Shaofeng and
               Zhao, Lin},
  title     = {{TMFN}: A Target-oriented Multi-grained Fusion Network for End-to-end Aspect-based Multimodal Sentiment Analysis},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  pages     = {16187--16197},
  url       = {https://aclanthology.org/2024.lrec-main.1407},
  abstract  = {End-to-end multimodal aspect-based sentiment analysis (MABSA) combines multimodal aspect terms extraction (MATE) with multimodal aspect sentiment classification (MASC), aiming to simultaneously extract aspect words and classify the sentiment polarity of each aspect. However, existing MABSA methods have overlooked two issues: (i) They only focus on fusing image regional information and textual words for two subtasks of MABSA. Whereas, MATE subtask relies more on global image information to assist in obtaining the quantity and attributes of aspects. Ignoring the integration with global information may affect the performance of MABSA methods. (ii) They fail to take advantage of target information. Nevertheless, the fine-grained details of targets are important for classifying sentiments of aspects. To solve these problems, we propose a Target-oriented Multi-grained Fusion Network(TMFN). It fuses text information with global coarse-grained image information for MATE subtask and with fine-grained image information for MASC subtask. In addition, a target-oriented feature alignment (TOFA) module is designed to enhance target-related information in image features with target details. In such a way, image features will contain more target emotional-related information which is beneficial to sentiment classification. Extensive experiments show that our method outperforms state-of-the-art methods on two benchmark datasets.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- One MODS record for the paper; the relatedItem of type "host" carries the proceedings title, editors and publisher. -->
  <mods ID="wang-etal-2024-tmfn">
    <titleInfo>
      <title>TMFN: A Target-oriented Multi-grained Fusion Network for End-to-end Aspect-based Multimodal Sentiment Analysis</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Di</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuzheng</namePart>
      <namePart type="family">He</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiao</namePart>
      <namePart type="family">Liang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yumin</namePart>
      <namePart type="family">Tian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shaofeng</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lin</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Min-Yen</namePart>
        <namePart type="family">Kan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Veronique</namePart>
        <namePart type="family">Hoste</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alessandro</namePart>
        <namePart type="family">Lenci</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sakriani</namePart>
        <namePart type="family">Sakti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nianwen</namePart>
        <namePart type="family">Xue</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>End-to-end multimodal aspect-based sentiment analysis (MABSA) combines multimodal aspect terms extraction (MATE) with multimodal aspect sentiment classification (MASC), aiming to simultaneously extract aspect words and classify the sentiment polarity of each aspect. However, existing MABSA methods have overlooked two issues: (i) They only focus on fusing image regional information and textual words for two subtasks of MABSA. Whereas, MATE subtask relies more on global image information to assist in obtaining the quantity and attributes of aspects. Ignoring the integration with global information may affect the performance of MABSA methods. (ii) They fail to take advantage of target information. Nevertheless, the fine-grained details of targets are important for classifying sentiments of aspects. To solve these problems, we propose a Target-oriented Multi-grained Fusion Network(TMFN). It fuses text information with global coarse-grained image information for MATE subtask and with fine-grained image information for MASC subtask. In addition, a target-oriented feature alignment (TOFA) module is designed to enhance target-related information in image features with target details. In such a way, image features will contain more target emotional-related information which is beneficial to sentiment classification. Extensive experiments show that our method outperforms state-of-the-art methods on two benchmark datasets.</abstract>
    <identifier type="citekey">wang-etal-2024-tmfn</identifier>
    <location>
      <url>https://aclanthology.org/2024.lrec-main.1407</url>
    </location>
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>16187</start>
        <end>16197</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T TMFN: A Target-oriented Multi-grained Fusion Network for End-to-end Aspect-based Multimodal Sentiment Analysis
%A Wang, Di
%A He, Yuzheng
%A Liang, Xiao
%A Tian, Yumin
%A Li, Shaofeng
%A Zhao, Lin
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F wang-etal-2024-tmfn
%X End-to-end multimodal aspect-based sentiment analysis (MABSA) combines multimodal aspect terms extraction (MATE) with multimodal aspect sentiment classification (MASC), aiming to simultaneously extract aspect words and classify the sentiment polarity of each aspect. However, existing MABSA methods have overlooked two issues: (i) They only focus on fusing image regional information and textual words for two subtasks of MABSA. Whereas, MATE subtask relies more on global image information to assist in obtaining the quantity and attributes of aspects. Ignoring the integration with global information may affect the performance of MABSA methods. (ii) They fail to take advantage of target information. Nevertheless, the fine-grained details of targets are important for classifying sentiments of aspects. To solve these problems, we propose a Target-oriented Multi-grained Fusion Network(TMFN). It fuses text information with global coarse-grained image information for MATE subtask and with fine-grained image information for MASC subtask. In addition, a target-oriented feature alignment (TOFA) module is designed to enhance target-related information in image features with target details. In such a way, image features will contain more target emotional-related information which is beneficial to sentiment classification. Extensive experiments show that our method outperforms state-of-the-art methods on two benchmark datasets.
%U https://aclanthology.org/2024.lrec-main.1407
%P 16187-16197
Markdown (Informal)
[TMFN: A Target-oriented Multi-grained Fusion Network for End-to-end Aspect-based Multimodal Sentiment Analysis](https://aclanthology.org/2024.lrec-main.1407) (Wang et al., LREC-COLING 2024)
ACL