@inproceedings{jin-etal-2024-rethinking,
title = "Rethinking the Multimodal Correlation of Multimodal Sequential Learning via Generalizable Attentional Results Alignment",
author = "Jin, Tao and
Lin, Wang and
Wang, Ye and
Li, Linjun and
Cheng, Xize and
Zhao, Zhou",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.287/",
doi = "10.18653/v1/2024.acl-long.287",
pages = "5247--5265",
abstract = "Transformer-based methods have gone mainstream in multimodal sequential learning. The intra and inter modality interactions are captured by the query-key associations of multi-head attention. In this way, the calculated multimodal contexts (attentional results) are expected to be relevant to the query modality. However, in existing literature, the alignment degree between different calculated attentional results of the same query are under-explored. Based on this concern, we propose a new constrained scheme called Multimodal Contextual Contrast (MCC), which could align the multiple attentional results from both local and global perspectives, making the information capture more efficient. Concretely, the calculated attentional results of different modalities are mapped into a common feature space, those attentional vectors with the same query are considered as a positive group and the remaining sets are negative. From local perspective, we sample the negative groups for a positive group by randomly changing the sequential step of one specific context and keeping the other stay the same. From coarse global perspective, we divide all the contextual groups into two sets (i.e., aligned and unaligned), making the total score of aligned group relatively large. We extend the vectorial inner product operation for more input and calculate the aligned score for each multimodal group. Considering that the computational complexity scales exponentially to the number of modalities, we adopt stochastic expectation approximation (SEA) for the real process. The extensive experimental results on several tasks reveal the effectiveness of our contributions."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jin-etal-2024-rethinking">
<titleInfo>
<title>Rethinking the Multimodal Correlation of Multimodal Sequential Learning via Generalizable Attentional Results Alignment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tao</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wang</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ye</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linjun</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xize</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhou</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Transformer-based methods have gone mainstream in multimodal sequential learning. The intra and inter modality interactions are captured by the query-key associations of multi-head attention. In this way, the calculated multimodal contexts (attentional results) are expected to be relevant to the query modality. However, in existing literature, the alignment degree between different calculated attentional results of the same query are under-explored. Based on this concern, we propose a new constrained scheme called Multimodal Contextual Contrast (MCC), which could align the multiple attentional results from both local and global perspectives, making the information capture more efficient. Concretely, the calculated attentional results of different modalities are mapped into a common feature space, those attentional vectors with the same query are considered as a positive group and the remaining sets are negative. From local perspective, we sample the negative groups for a positive group by randomly changing the sequential step of one specific context and keeping the other stay the same. From coarse global perspective, we divide all the contextual groups into two sets (i.e., aligned and unaligned), making the total score of aligned group relatively large. We extend the vectorial inner product operation for more input and calculate the aligned score for each multimodal group. Considering that the computational complexity scales exponentially to the number of modalities, we adopt stochastic expectation approximation (SEA) for the real process. The extensive experimental results on several tasks reveal the effectiveness of our contributions.</abstract>
<identifier type="citekey">jin-etal-2024-rethinking</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.287</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.287/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>5247</start>
<end>5265</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Rethinking the Multimodal Correlation of Multimodal Sequential Learning via Generalizable Attentional Results Alignment
%A Jin, Tao
%A Lin, Wang
%A Wang, Ye
%A Li, Linjun
%A Cheng, Xize
%A Zhao, Zhou
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F jin-etal-2024-rethinking
%X Transformer-based methods have gone mainstream in multimodal sequential learning. The intra and inter modality interactions are captured by the query-key associations of multi-head attention. In this way, the calculated multimodal contexts (attentional results) are expected to be relevant to the query modality. However, in existing literature, the alignment degree between different calculated attentional results of the same query are under-explored. Based on this concern, we propose a new constrained scheme called Multimodal Contextual Contrast (MCC), which could align the multiple attentional results from both local and global perspectives, making the information capture more efficient. Concretely, the calculated attentional results of different modalities are mapped into a common feature space, those attentional vectors with the same query are considered as a positive group and the remaining sets are negative. From local perspective, we sample the negative groups for a positive group by randomly changing the sequential step of one specific context and keeping the other stay the same. From coarse global perspective, we divide all the contextual groups into two sets (i.e., aligned and unaligned), making the total score of aligned group relatively large. We extend the vectorial inner product operation for more input and calculate the aligned score for each multimodal group. Considering that the computational complexity scales exponentially to the number of modalities, we adopt stochastic expectation approximation (SEA) for the real process. The extensive experimental results on several tasks reveal the effectiveness of our contributions.
%R 10.18653/v1/2024.acl-long.287
%U https://aclanthology.org/2024.luhme-long.287/
%U https://doi.org/10.18653/v1/2024.acl-long.287
%P 5247-5265
Markdown (Informal)
[Rethinking the Multimodal Correlation of Multimodal Sequential Learning via Generalizable Attentional Results Alignment](https://aclanthology.org/2024.luhme-long.287/) (Jin et al., ACL 2024)
ACL