@inproceedings{urooj-etal-2020-mmft,
title = "{MMFT-BERT}: {M}ultimodal {F}usion {T}ransformer with {B}{ERT} {E}ncodings for {V}isual {Q}uestion {A}nswering",
author = "Urooj, Aisha and
Mazaheri, Amir and
Da Vitoria Lobo, Niels and
Shah, Mubarak",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.417",
doi = "10.18653/v1/2020.findings-emnlp.417",
pages = "4648--4660",
abstract = "We present MMFT-BERT(MultiModal FusionTransformer with BERT encodings), to solve Visual Question Answering (VQA) ensuring individual and combined processing of multiple input modalities. Our approach benefits from processing multimodal data (video and text) adopting the BERT encodings individually and using a novel transformer-based fusion method to fuse them together. Our method decomposes the different sources of modalities, into different BERT instances with similar architectures, but variable weights. This achieves SOTA results on the TVQA dataset. Additionally, we provide TVQA-Visual, an isolated diagnostic subset of TVQA, which strictly requires the knowledge of visual (V) modality based on a human annotator{'}s judgment. This set of questions helps us to study the model{'}s behavior and the challenges TVQA poses to prevent the achievement of super human performance. Extensive experiments show the effectiveness and superiority of our method.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="urooj-etal-2020-mmft">
<titleInfo>
<title>MMFT-BERT: Multimodal Fusion Transformer with BERT Encodings for Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aisha</namePart>
<namePart type="family">Urooj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Mazaheri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niels</namePart>
<namePart type="family">Da vitoria lobo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mubarak</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present MMFT-BERT (MultiModal Fusion Transformer with BERT encodings) to solve Visual Question Answering (VQA) while ensuring individual and combined processing of multiple input modalities. Our approach benefits from processing multimodal data (video and text) by adopting BERT encodings individually and using a novel transformer-based fusion method to fuse them together. Our method decomposes the different sources of modalities into different BERT instances with similar architectures but variable weights. This achieves SOTA results on the TVQA dataset. Additionally, we provide TVQA-Visual, an isolated diagnostic subset of TVQA which strictly requires knowledge of the visual (V) modality, based on a human annotator’s judgment. This set of questions helps us study the model’s behavior and the challenges TVQA poses that prevent superhuman performance. Extensive experiments show the effectiveness and superiority of our method.</abstract>
<identifier type="citekey">urooj-etal-2020-mmft</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.417</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.417</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>4648</start>
<end>4660</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MMFT-BERT: Multimodal Fusion Transformer with BERT Encodings for Visual Question Answering
%A Urooj, Aisha
%A Mazaheri, Amir
%A Da Vitoria Lobo, Niels
%A Shah, Mubarak
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F urooj-etal-2020-mmft
%X We present MMFT-BERT (MultiModal Fusion Transformer with BERT encodings) to solve Visual Question Answering (VQA) while ensuring individual and combined processing of multiple input modalities. Our approach benefits from processing multimodal data (video and text) by adopting BERT encodings individually and using a novel transformer-based fusion method to fuse them together. Our method decomposes the different sources of modalities into different BERT instances with similar architectures but variable weights. This achieves SOTA results on the TVQA dataset. Additionally, we provide TVQA-Visual, an isolated diagnostic subset of TVQA which strictly requires knowledge of the visual (V) modality, based on a human annotator’s judgment. This set of questions helps us study the model’s behavior and the challenges TVQA poses that prevent superhuman performance. Extensive experiments show the effectiveness and superiority of our method.
%R 10.18653/v1/2020.findings-emnlp.417
%U https://aclanthology.org/2020.findings-emnlp.417
%U https://doi.org/10.18653/v1/2020.findings-emnlp.417
%P 4648-4660
Markdown (Informal)
[MMFT-BERT: Multimodal Fusion Transformer with BERT Encodings for Visual Question Answering](https://aclanthology.org/2020.findings-emnlp.417) (Urooj et al., Findings 2020)
ACL
Aisha Urooj, Amir Mazaheri, Niels Da Vitoria Lobo, and Mubarak Shah. 2020. MMFT-BERT: Multimodal Fusion Transformer with BERT Encodings for Visual Question Answering. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 4648–4660, Online. Association for Computational Linguistics.