@inproceedings{huang-etal-2019-multi-grained,
title = "Multi-grained Attention with Object-level Grounding for Visual Question Answering",
author = "Huang, Pingping and
Huang, Jianhui and
Guo, Yuqing and
Qiao, Min and
Zhu, Yong",
editor = "Korhonen, Anna and
Traum, David and
M{\`a}rquez, Llu{\'i}s",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1349/",
doi = "10.18653/v1/P19-1349",
pages = "3595--3600",
abstract = "Attention mechanisms are widely used in Visual Question Answering (VQA) to search for visual clues related to the question. Most approaches train attention models from a coarse-grained association between sentences and images, which tends to fail on small objects or uncommon concepts. To address this problem, this paper proposes a multi-grained attention method. It learns explicit word-object correspondence by two types of word-level attention complementary to the sentence-image association. Evaluated on the VQA benchmark, the multi-grained attention model achieves competitive performance with state-of-the-art models. And the visualized attention maps demonstrate that addition of object-level groundings leads to a better understanding of the images and locates the attended objects more precisely."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2019-multi-grained">
<titleInfo>
<title>Multi-grained Attention with Object-level Grounding for Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pingping</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianhui</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuqing</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Qiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yong</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Korhonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Traum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluí</namePart>
<namePart type="given">s</namePart>
<namePart type="family">Màrquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Attention mechanisms are widely used in Visual Question Answering (VQA) to search for visual clues related to the question. Most approaches train attention models from a coarse-grained association between sentences and images, which tends to fail on small objects or uncommon concepts. To address this problem, this paper proposes a multi-grained attention method. It learns explicit word-object correspondence through two types of word-level attention complementary to the sentence-image association. Evaluated on the VQA benchmark, the multi-grained attention model achieves performance competitive with state-of-the-art models, and the visualized attention maps demonstrate that the addition of object-level grounding leads to a better understanding of the images and locates the attended objects more precisely.</abstract>
<identifier type="citekey">huang-etal-2019-multi-grained</identifier>
<identifier type="doi">10.18653/v1/P19-1349</identifier>
<location>
<url>https://aclanthology.org/P19-1349/</url>
</location>
<part>
<date>2019-07</date>
<extent unit="page">
<start>3595</start>
<end>3600</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-grained Attention with Object-level Grounding for Visual Question Answering
%A Huang, Pingping
%A Huang, Jianhui
%A Guo, Yuqing
%A Qiao, Min
%A Zhu, Yong
%Y Korhonen, Anna
%Y Traum, David
%Y Màrquez, Lluís
%S Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
%D 2019
%8 July
%I Association for Computational Linguistics
%C Florence, Italy
%F huang-etal-2019-multi-grained
%X Attention mechanisms are widely used in Visual Question Answering (VQA) to search for visual clues related to the question. Most approaches train attention models from a coarse-grained association between sentences and images, which tends to fail on small objects or uncommon concepts. To address this problem, this paper proposes a multi-grained attention method. It learns explicit word-object correspondence through two types of word-level attention complementary to the sentence-image association. Evaluated on the VQA benchmark, the multi-grained attention model achieves performance competitive with state-of-the-art models, and the visualized attention maps demonstrate that the addition of object-level grounding leads to a better understanding of the images and locates the attended objects more precisely.
%R 10.18653/v1/P19-1349
%U https://aclanthology.org/P19-1349/
%U https://doi.org/10.18653/v1/P19-1349
%P 3595-3600
Markdown (Informal)
[Multi-grained Attention with Object-level Grounding for Visual Question Answering](https://aclanthology.org/P19-1349/) (Huang et al., ACL 2019)
ACL
Pingping Huang, Jianhui Huang, Yuqing Guo, Min Qiao, and Yong Zhu. 2019. [Multi-grained Attention with Object-level Grounding for Visual Question Answering](https://aclanthology.org/P19-1349/). In *Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics*, pages 3595–3600, Florence, Italy. Association for Computational Linguistics.
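
The abstract describes word-level attention that scores explicit word-object correspondence, complementary to the usual sentence-image association. The sketch below is a minimal, hypothetical illustration of such a word-object attention map (scaled dot-product over pre-extracted object features); all names, shapes, and projections are assumptions for illustration, not the authors' code or architecture.

```python
# Hypothetical sketch of word-object attention for VQA grounding.
# Names, dimensions, and projections are illustrative assumptions,
# not the implementation from Huang et al. (2019).
import torch
import torch.nn.functional as F

def word_object_attention(word_emb, obj_feat, w_proj, o_proj):
    """Score each question word against each detected object region.

    word_emb: (T, d_w)  word embeddings for the question
    obj_feat: (K, d_o)  features for K detected object regions
    w_proj:   (d_w, d)  learned projection for words
    o_proj:   (d_o, d)  learned projection for objects
    Returns a (T, K) attention map: per-word weights over objects.
    """
    q = word_emb @ w_proj                    # (T, d)
    v = obj_feat @ o_proj                    # (K, d)
    scores = q @ v.t() / v.size(1) ** 0.5    # scaled dot-product scores
    return F.softmax(scores, dim=-1)         # normalize over objects

# Toy usage: 5 question words, 36 detected regions (detector-style features)
T, K, d_w, d_o, d = 5, 36, 300, 2048, 512
attn = word_object_attention(
    torch.randn(T, d_w), torch.randn(K, d_o),
    torch.randn(d_w, d), torch.randn(d_o, d))
print(attn.shape)  # torch.Size([5, 36]); each row sums to 1
```

In the setting the abstract describes, such word-object maps would supplement, not replace, the coarser sentence-image attention.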