@inproceedings{ziaeefard-lecue-2020-towards,
title = "Towards Knowledge-Augmented Visual Question Answering",
author = "Ziaeefard, Maryam and
Lecue, Freddy",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.169",
doi = "10.18653/v1/2020.coling-main.169",
pages = "1863--1873",
abstract = "Visual Question Answering (VQA) remains algorithmically challenging while it is effortless for humans. Humans combine visual observations with general and commonsense knowledge to answer questions about a given image. In this paper, we address the problem of incorporating general knowledge into VQA models while leveraging the visual information. We propose a model that captures the interactions between objects in a visual scene and entities in an external knowledge source. Our model is a graph-based approach that combines scene graphs with concept graphs, which learns a question-adaptive graph representation of related knowledge instances. We use Graph Attention Networks to set higher importance to key knowledge instances that are mostly relevant to each question. We exploit ConceptNet as the source of general knowledge and evaluate the performance of our model on the challenging OK-VQA dataset.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ziaeefard-lecue-2020-towards">
<titleInfo>
<title>Towards Knowledge-Augmented Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maryam</namePart>
<namePart type="family">Ziaeefard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Freddy</namePart>
<namePart type="family">Lecue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual Question Answering (VQA) remains algorithmically challenging while it is effortless for humans. Humans combine visual observations with general and commonsense knowledge to answer questions about a given image. In this paper, we address the problem of incorporating general knowledge into VQA models while leveraging the visual information. We propose a model that captures the interactions between objects in a visual scene and entities in an external knowledge source. Our model is a graph-based approach that combines scene graphs with concept graphs and learns a question-adaptive graph representation of related knowledge instances. We use Graph Attention Networks to assign higher importance to the key knowledge instances most relevant to each question. We exploit ConceptNet as the source of general knowledge and evaluate the performance of our model on the challenging OK-VQA dataset.</abstract>
<identifier type="citekey">ziaeefard-lecue-2020-towards</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.169</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.169</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>1863</start>
<end>1873</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Knowledge-Augmented Visual Question Answering
%A Ziaeefard, Maryam
%A Lecue, Freddy
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F ziaeefard-lecue-2020-towards
%X Visual Question Answering (VQA) remains algorithmically challenging while it is effortless for humans. Humans combine visual observations with general and commonsense knowledge to answer questions about a given image. In this paper, we address the problem of incorporating general knowledge into VQA models while leveraging the visual information. We propose a model that captures the interactions between objects in a visual scene and entities in an external knowledge source. Our model is a graph-based approach that combines scene graphs with concept graphs and learns a question-adaptive graph representation of related knowledge instances. We use Graph Attention Networks to assign higher importance to the key knowledge instances most relevant to each question. We exploit ConceptNet as the source of general knowledge and evaluate the performance of our model on the challenging OK-VQA dataset.
%R 10.18653/v1/2020.coling-main.169
%U https://aclanthology.org/2020.coling-main.169
%U https://doi.org/10.18653/v1/2020.coling-main.169
%P 1863-1873
Markdown (Informal)
[Towards Knowledge-Augmented Visual Question Answering](https://aclanthology.org/2020.coling-main.169) (Ziaeefard & Lecue, COLING 2020)
ACL
- Maryam Ziaeefard and Freddy Lecue. 2020. Towards Knowledge-Augmented Visual Question Answering. In Proceedings of the 28th International Conference on Computational Linguistics, pages 1863–1873, Barcelona, Spain (Online). International Committee on Computational Linguistics.
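The abstract describes a graph attention mechanism that weights knowledge instances by their relevance to the question. As an informal illustration only (not the authors' released code), the sketch below shows one question-conditioned graph-attention step over node features; every name, shape, and the exact scoring form are assumptions made for this example.

```python
# Illustrative sketch: a single question-conditioned graph-attention step,
# in the spirit of the GAT-based weighting described in the abstract.
# All tensor names, shapes, and the scoring function are assumptions.
import torch
import torch.nn.functional as F


def question_conditioned_gat_step(node_feats, adj, question_vec, W, a):
    """One attention round: nodes attend to their neighbours, with scores
    modulated by an encoded question.

    node_feats:   (N, d_in)     node features (scene-graph / ConceptNet entities)
    adj:          (N, N)        adjacency matrix; expected to include self-loops
    question_vec: (d_out,)      encoded question
    W:            (d_in, d_out) shared linear projection
    a:            (3 * d_out,)  attention parameter vector
    """
    h = node_feats @ W                                     # project nodes: (N, d_out)
    N = h.size(0)
    q = question_vec.expand(N, N, -1)                      # broadcast question to all pairs
    # Pairwise concatenation [h_i || h_j || q] for every candidate edge.
    pair = torch.cat(
        [h.unsqueeze(1).expand(N, N, -1),
         h.unsqueeze(0).expand(N, N, -1),
         q], dim=-1)                                       # (N, N, 3*d_out)
    scores = F.leaky_relu(pair @ a)                        # raw attention scores: (N, N)
    scores = scores.masked_fill(adj == 0, float("-inf"))   # restrict attention to graph edges
    alpha = torch.softmax(scores, dim=-1)                  # question-aware attention weights
    return alpha @ h                                       # updated node representations
```

Stacking a few such steps and pooling the resulting node representations would give one possible question-adaptive graph embedding; the paper itself should be consulted for the actual model.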