BibTeX
@inproceedings{sahu-vechtomova-2021-adaptive,
title = "Adaptive Fusion Techniques for Multimodal Data",
author = "Sahu, Gaurav and
Vechtomova, Olga",
editor = "Merlo, Paola and
Tiedemann, J{\"o}rg and
Tsarfaty, Reut",
booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
month = apr,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eacl-main.275",
doi = "10.18653/v1/2021.eacl-main.275",
pages = "3156--3166",
abstract = "Effective fusion of data from multiple modalities, such as video, speech, and text, is challenging due to the heterogeneous nature of multimodal data. In this paper, we propose adaptive fusion techniques that aim to model context from different modalities effectively. Instead of defining a deterministic fusion operation, such as concatenation, for the network, we let the network decide {``}how{''} to combine a given set of multimodal features more effectively. We propose two networks: 1) Auto-Fusion, which learns to compress information from different modalities while preserving the context, and 2) GAN-Fusion, which regularizes the learned latent space given context from complementing modalities. A quantitative evaluation on the tasks of multimodal machine translation and emotion recognition suggests that our lightweight, adaptive networks can better model context from other modalities than existing methods, many of which employ massive transformer-based networks.",
}
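The abstract above describes Auto-Fusion only at a high level: per-modality features are compressed into a shared latent code, and a reconstruction objective keeps the original multimodal context recoverable from that code. Below is a minimal PyTorch sketch of that idea; the module names (`AutoFusion`, `fuse_in`, `fuse_out`), layer sizes, and the plain MSE reconstruction loss are illustrative assumptions, not the authors' exact architecture.

```python
# Minimal sketch of the Auto-Fusion idea from the abstract: compress
# concatenated modality features into a fused latent, and regularize it
# by requiring that the concatenated input be reconstructable from it.
# All names and dimensions here are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class AutoFusion(nn.Module):
    def __init__(self, input_dims, latent_dim=128):
        super().__init__()
        total = sum(input_dims)  # e.g. text + speech + video feature sizes
        self.fuse_in = nn.Sequential(nn.Linear(total, latent_dim), nn.Tanh())
        self.fuse_out = nn.Linear(latent_dim, total)

    def forward(self, features):
        # features: list of per-modality tensors, each of shape (batch, dim_i)
        concat = torch.cat(features, dim=-1)
        z = self.fuse_in(concat)  # fused multimodal latent
        # Reconstruction loss encourages z to preserve the input context.
        recon_loss = F.mse_loss(self.fuse_out(z), concat)
        return z, recon_loss  # add recon_loss to the downstream task loss

# Usage: fuse (hypothetical) 300-d text, 74-d speech, and 35-d video features.
fusion = AutoFusion([300, 74, 35])
z, aux = fusion([torch.randn(8, 300), torch.randn(8, 74), torch.randn(8, 35)])
```

The auxiliary loss is what makes the fusion "adaptive" in spirit: rather than hard-coding concatenation as the final representation, the network learns a compression that it is penalized for making lossy.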
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sahu-vechtomova-2021-adaptive">
<titleInfo>
<title>Adaptive Fusion Techniques for Multimodal Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gaurav</namePart>
<namePart type="family">Sahu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olga</namePart>
<namePart type="family">Vechtomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Merlo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Effective fusion of data from multiple modalities, such as video, speech, and text, is challenging due to the heterogeneous nature of multimodal data. In this paper, we propose adaptive fusion techniques that aim to model context from different modalities effectively. Instead of defining a deterministic fusion operation, such as concatenation, for the network, we let the network decide “how” to combine a given set of multimodal features more effectively. We propose two networks: 1) Auto-Fusion, which learns to compress information from different modalities while preserving the context, and 2) GAN-Fusion, which regularizes the learned latent space given context from complementing modalities. A quantitative evaluation on the tasks of multimodal machine translation and emotion recognition suggests that our lightweight, adaptive networks can better model context from other modalities than existing methods, many of which employ massive transformer-based networks.</abstract>
<identifier type="citekey">sahu-vechtomova-2021-adaptive</identifier>
<identifier type="doi">10.18653/v1/2021.eacl-main.275</identifier>
<location>
<url>https://aclanthology.org/2021.eacl-main.275</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>3156</start>
<end>3166</end>
</extent>
</part>
</mods>
</modsCollection>
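GAN-Fusion, the second network named in the abstract, regularizes the learned latent space adversarially using context from the complementing modalities. The sketch below shows one plausible realization under stated assumptions: a generator maps the complementing modalities' features into the target modality's space, and a discriminator pushes those generated codes toward the real target-modality distribution. Module names, dimensions, and the BCE objective are hypothetical, not the paper's exact formulation.

```python
# Illustrative sketch of adversarial fusion in the spirit of GAN-Fusion:
# a generator produces a target-modality latent from the complementing
# modalities, and a discriminator regularizes that latent toward the
# real target-modality feature distribution.
# Names, sizes, and the BCE objective are assumptions for illustration.
import torch
import torch.nn as nn

class GanFusionBlock(nn.Module):
    def __init__(self, context_dim, target_dim, hidden=128):
        super().__init__()
        self.generator = nn.Sequential(
            nn.Linear(context_dim, hidden), nn.ReLU(), nn.Linear(hidden, target_dim))
        self.discriminator = nn.Sequential(
            nn.Linear(target_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))
        self.bce = nn.BCEWithLogitsLoss()

    def forward(self, context_feats, target_feats):
        # context_feats: concatenated complementing-modality features
        fake = self.generator(context_feats)
        real_logit = self.discriminator(target_feats)
        fake_logit = self.discriminator(fake.detach())
        # Discriminator: tell real target-modality features from generated ones.
        d_loss = self.bce(real_logit, torch.ones_like(real_logit)) + \
                 self.bce(fake_logit, torch.zeros_like(fake_logit))
        # Generator: make the fused latent indistinguishable from real features.
        g_loss = self.bce(self.discriminator(fake), torch.ones_like(fake_logit))
        return fake, d_loss, g_loss  # optimize d_loss and g_loss separately

# Usage: regularize a 74-d speech latent with 335-d text+video context.
block = GanFusionBlock(context_dim=335, target_dim=74)
fused, d_loss, g_loss = block(torch.randn(8, 335), torch.randn(8, 74))
```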
Endnote
%0 Conference Proceedings
%T Adaptive Fusion Techniques for Multimodal Data
%A Sahu, Gaurav
%A Vechtomova, Olga
%Y Merlo, Paola
%Y Tiedemann, Jörg
%Y Tsarfaty, Reut
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F sahu-vechtomova-2021-adaptive
%X Effective fusion of data from multiple modalities, such as video, speech, and text, is challenging due to the heterogeneous nature of multimodal data. In this paper, we propose adaptive fusion techniques that aim to model context from different modalities effectively. Instead of defining a deterministic fusion operation, such as concatenation, for the network, we let the network decide “how” to combine a given set of multimodal features more effectively. We propose two networks: 1) Auto-Fusion, which learns to compress information from different modalities while preserving the context, and 2) GAN-Fusion, which regularizes the learned latent space given context from complementing modalities. A quantitative evaluation on the tasks of multimodal machine translation and emotion recognition suggests that our lightweight, adaptive networks can better model context from other modalities than existing methods, many of which employ massive transformer-based networks.
%R 10.18653/v1/2021.eacl-main.275
%U https://aclanthology.org/2021.eacl-main.275
%U https://doi.org/10.18653/v1/2021.eacl-main.275
%P 3156-3166
Markdown (Informal)
[Adaptive Fusion Techniques for Multimodal Data](https://aclanthology.org/2021.eacl-main.275) (Sahu & Vechtomova, EACL 2021)
ACL
Gaurav Sahu and Olga Vechtomova. 2021. Adaptive Fusion Techniques for Multimodal Data. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pages 3156–3166, Online. Association for Computational Linguistics.