@inproceedings{tian-etal-2022-dual,
  author    = {Tian, Weidong and
               Li, Haodong and
               Zhao, Zhong-Qiu},
  title     = {Dual Capsule Attention Mask Network with Mutual Learning for Visual Question Answering},
  editor    = {Calzolari, Nicoletta and
               Huang, Chu-Ren and
               Kim, Hansaem and
               Pustejovsky, James and
               Wanner, Leo and
               Choi, Key-Sun and
               Ryu, Pum-Mo and
               Chen, Hsin-Hsi and
               Donatelli, Lucia and
               Ji, Heng and
               Kurohashi, Sadao and
               Paggio, Patrizia and
               Xue, Nianwen and
               Kim, Seokhwan and
               Hahm, Younggyun and
               He, Zhong and
               Lee, Tony Kyungil and
               Santus, Enrico and
               Bond, Francis and
               Na, Seung-Hoon},
  booktitle = {Proceedings of the 29th International Conference on Computational Linguistics},
  month     = oct,
  year      = {2022},
  address   = {Gyeongju, Republic of Korea},
  publisher = {International Committee on Computational Linguistics},
  pages     = {5678--5688},
  url       = {https://aclanthology.org/2022.coling-1.500},
  abstract  = {A Visual Question Answering (VQA) model processes images and questions simultaneously with rich semantic information. The attention mechanism can highlight fine-grained features with critical information, thus ensuring that feature extraction emphasizes the objects related to the questions. However, unattended coarse-grained information is also essential for questions involving global elements. We believe that global coarse-grained information and local fine-grained information can complement each other to provide richer comprehensive information. In this paper, we propose a dual capsule attention mask network with mutual learning for VQA. Specifically, it contains two branches processing coarse-grained features and fine-grained features, respectively. We also design a novel stackable dual capsule attention module to fuse features and locate evidence. The two branches are combined to make final predictions for VQA. Experimental results show that our method outperforms the baselines in terms of VQA performance and interpretability and achieves new SOTA performance on the VQA-v2 dataset.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey tian-etal-2022-dual: three authors, followed by the host proceedings volume and its editors. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="tian-etal-2022-dual">
    <titleInfo>
      <title>Dual Capsule Attention Mask Network with Mutual Learning for Visual Question Answering</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Weidong</namePart>
      <namePart type="family">Tian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Haodong</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhong-Qiu</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-10</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 29th International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Chu-Ren</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hansaem</namePart>
        <namePart type="family">Kim</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">James</namePart>
        <namePart type="family">Pustejovsky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Key-Sun</namePart>
        <namePart type="family">Choi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pum-Mo</namePart>
        <namePart type="family">Ryu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hsin-Hsi</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lucia</namePart>
        <namePart type="family">Donatelli</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Heng</namePart>
        <namePart type="family">Ji</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sadao</namePart>
        <namePart type="family">Kurohashi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Patrizia</namePart>
        <namePart type="family">Paggio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nianwen</namePart>
        <namePart type="family">Xue</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Seokhwan</namePart>
        <namePart type="family">Kim</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Younggyun</namePart>
        <namePart type="family">Hahm</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zhong</namePart>
        <namePart type="family">He</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tony</namePart>
        <namePart type="given">Kyungil</namePart>
        <namePart type="family">Lee</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Enrico</namePart>
        <namePart type="family">Santus</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Francis</namePart>
        <namePart type="family">Bond</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Seung-Hoon</namePart>
        <namePart type="family">Na</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>International Committee on Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>A Visual Question Answering (VQA) model processes images and questions simultaneously with rich semantic information. The attention mechanism can highlight fine-grained features with critical information, thus ensuring that feature extraction emphasizes the objects related to the questions. However, unattended coarse-grained information is also essential for questions involving global elements. We believe that global coarse-grained information and local fine-grained information can complement each other to provide richer comprehensive information. In this paper, we propose a dual capsule attention mask network with mutual learning for VQA. Specifically, it contains two branches processing coarse-grained features and fine-grained features, respectively. We also design a novel stackable dual capsule attention module to fuse features and locate evidence. The two branches are combined to make final predictions for VQA. Experimental results show that our method outperforms the baselines in terms of VQA performance and interpretability and achieves new SOTA performance on the VQA-v2 dataset.</abstract>
    <identifier type="citekey">tian-etal-2022-dual</identifier>
    <location>
      <url>https://aclanthology.org/2022.coling-1.500</url>
    </location>
    <part>
      <date>2022-10</date>
      <extent unit="page">
        <start>5678</start>
        <end>5688</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Dual Capsule Attention Mask Network with Mutual Learning for Visual Question Answering
%A Tian, Weidong
%A Li, Haodong
%A Zhao, Zhong-Qiu
%Y Calzolari, Nicoletta
%Y Huang, Chu-Ren
%Y Kim, Hansaem
%Y Pustejovsky, James
%Y Wanner, Leo
%Y Choi, Key-Sun
%Y Ryu, Pum-Mo
%Y Chen, Hsin-Hsi
%Y Donatelli, Lucia
%Y Ji, Heng
%Y Kurohashi, Sadao
%Y Paggio, Patrizia
%Y Xue, Nianwen
%Y Kim, Seokhwan
%Y Hahm, Younggyun
%Y He, Zhong
%Y Lee, Tony Kyungil
%Y Santus, Enrico
%Y Bond, Francis
%Y Na, Seung-Hoon
%S Proceedings of the 29th International Conference on Computational Linguistics
%D 2022
%8 October
%I International Committee on Computational Linguistics
%C Gyeongju, Republic of Korea
%F tian-etal-2022-dual
%X A Visual Question Answering (VQA) model processes images and questions simultaneously with rich semantic information. The attention mechanism can highlight fine-grained features with critical information, thus ensuring that feature extraction emphasizes the objects related to the questions. However, unattended coarse-grained information is also essential for questions involving global elements. We believe that global coarse-grained information and local fine-grained information can complement each other to provide richer comprehensive information. In this paper, we propose a dual capsule attention mask network with mutual learning for VQA. Specifically, it contains two branches processing coarse-grained features and fine-grained features, respectively. We also design a novel stackable dual capsule attention module to fuse features and locate evidence. The two branches are combined to make final predictions for VQA. Experimental results show that our method outperforms the baselines in terms of VQA performance and interpretability and achieves new SOTA performance on the VQA-v2 dataset.
%U https://aclanthology.org/2022.coling-1.500
%P 5678-5688
Markdown (Informal)
[Dual Capsule Attention Mask Network with Mutual Learning for Visual Question Answering](https://aclanthology.org/2022.coling-1.500) (Tian et al., COLING 2022)
ACL