@inproceedings{zhang-etal-2021-lrra,
title = "{LRRA}:A Transparent Neural-Symbolic Reasoning Framework for Real-World Visual Question Answering",
author = "Zhang, Wan and
Keming, Chen and
Yujie, Zhang and
Jinan, Xu and
Yufeng, Chen",
editor = "Li, Sheng and
Sun, Maosong and
Liu, Yang and
Wu, Hua and
Liu, Kang and
Che, Wanxiang and
He, Shizhu and
Rao, Gaoqi",
booktitle = "Proceedings of the 20th Chinese National Conference on Computational Linguistics",
month = aug,
year = "2021",
address = "Huhhot, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2021.ccl-1.92",
pages = "1037--1045",
abstract = "The predominant approach of visual question answering (VQA) relies on encoding the imageand question with a {''}black box{''} neural encoder and decoding a single token into answers suchas {''}yes{''} or {''}no{''}. Despite this approach{'}s strong quantitative results it struggles to come up withhuman-readable forms of justification for the prediction process. To address this insufficiency we propose LRRA[LookReadReasoningAnswer]a transparent neural-symbolic framework forvisual question answering that solves the complicated problem in the real world step-by-steplike humans and provides human-readable form of justification at each step. Specifically LRRAlearns to first convert an image into a scene graph and parse a question into multiple reasoning instructions. It then executes the reasoning instructions one at a time by traversing the scenegraph using a recurrent neural-symbolic execution module. Finally it generates answers to the given questions and makes corresponding marks on the image. Furthermore we believe that the relations between objects in the question is of great significance for obtaining the correct answerso we create a perturbed GQA test set by removing linguistic cues (attributes and relations) in the questions to analyze which part of the question contributes more to the answer. Our experimentson the GQA dataset show that LRRA is significantly better than the existing representative model(57.12{\%} vs. 56.39{\%}). Our experiments on the perturbed GQA test set show that the relations between objects is more important for answering complicated questions than the attributes ofobjects.Keywords:Visual Question Answering Relations Between Objects Neural-Symbolic Reason-ing.",
language = "English",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2021-lrra">
<titleInfo>
<title>LRRA:A Transparent Neural-Symbolic Reasoning Framework for Real-World Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Keming</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhang</namePart>
<namePart type="family">Yujie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xu</namePart>
<namePart type="family">Jinan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Yufeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Chinese National Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sheng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hua</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shizhu</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gaoqi</namePart>
<namePart type="family">Rao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Huhhot, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The predominant approach of visual question answering (VQA) relies on encoding the imageand question with a ”black box” neural encoder and decoding a single token into answers suchas ”yes” or ”no”. Despite this approach’s strong quantitative results it struggles to come up withhuman-readable forms of justification for the prediction process. To address this insufficiency we propose LRRA[LookReadReasoningAnswer]a transparent neural-symbolic framework forvisual question answering that solves the complicated problem in the real world step-by-steplike humans and provides human-readable form of justification at each step. Specifically LRRAlearns to first convert an image into a scene graph and parse a question into multiple reasoning instructions. It then executes the reasoning instructions one at a time by traversing the scenegraph using a recurrent neural-symbolic execution module. Finally it generates answers to the given questions and makes corresponding marks on the image. Furthermore we believe that the relations between objects in the question is of great significance for obtaining the correct answerso we create a perturbed GQA test set by removing linguistic cues (attributes and relations) in the questions to analyze which part of the question contributes more to the answer. Our experimentson the GQA dataset show that LRRA is significantly better than the existing representative model(57.12% vs. 56.39%). Our experiments on the perturbed GQA test set show that the relations between objects is more important for answering complicated questions than the attributes ofobjects.Keywords:Visual Question Answering Relations Between Objects Neural-Symbolic Reason-ing.</abstract>
<identifier type="citekey">zhang-etal-2021-lrra</identifier>
<location>
<url>https://aclanthology.org/2021.ccl-1.92</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>1037</start>
<end>1045</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LRRA:A Transparent Neural-Symbolic Reasoning Framework for Real-World Visual Question Answering
%A Zhang, Wan
%A Keming, Chen
%A Yujie, Zhang
%A Jinan, Xu
%A Yufeng, Chen
%Y Li, Sheng
%Y Sun, Maosong
%Y Liu, Yang
%Y Wu, Hua
%Y Liu, Kang
%Y Che, Wanxiang
%Y He, Shizhu
%Y Rao, Gaoqi
%S Proceedings of the 20th Chinese National Conference on Computational Linguistics
%D 2021
%8 August
%I Chinese Information Processing Society of China
%C Huhhot, China
%G English
%F zhang-etal-2021-lrra
%X The predominant approach of visual question answering (VQA) relies on encoding the imageand question with a ”black box” neural encoder and decoding a single token into answers suchas ”yes” or ”no”. Despite this approach’s strong quantitative results it struggles to come up withhuman-readable forms of justification for the prediction process. To address this insufficiency we propose LRRA[LookReadReasoningAnswer]a transparent neural-symbolic framework forvisual question answering that solves the complicated problem in the real world step-by-steplike humans and provides human-readable form of justification at each step. Specifically LRRAlearns to first convert an image into a scene graph and parse a question into multiple reasoning instructions. It then executes the reasoning instructions one at a time by traversing the scenegraph using a recurrent neural-symbolic execution module. Finally it generates answers to the given questions and makes corresponding marks on the image. Furthermore we believe that the relations between objects in the question is of great significance for obtaining the correct answerso we create a perturbed GQA test set by removing linguistic cues (attributes and relations) in the questions to analyze which part of the question contributes more to the answer. Our experimentson the GQA dataset show that LRRA is significantly better than the existing representative model(57.12% vs. 56.39%). Our experiments on the perturbed GQA test set show that the relations between objects is more important for answering complicated questions than the attributes ofobjects.Keywords:Visual Question Answering Relations Between Objects Neural-Symbolic Reason-ing.
%U https://aclanthology.org/2021.ccl-1.92
%P 1037-1045
Markdown (Informal)
[LRRA:A Transparent Neural-Symbolic Reasoning Framework for Real-World Visual Question Answering](https://aclanthology.org/2021.ccl-1.92) (Zhang et al., CCL 2021)
ACL