@inproceedings{shimizu-etal-2018-visual,
title = "Visual Question Answering Dataset for Bilingual Image Understanding: A Study of Cross-Lingual Transfer Using Attention Maps",
author = "Shimizu, Nobuyuki and
Rong, Na and
Miyazaki, Takashi",
editor = "Bender, Emily M. and
Derczynski, Leon and
Isabelle, Pierre",
booktitle = "Proceedings of the 27th International Conference on Computational Linguistics",
month = aug,
year = "2018",
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/C18-1163",
pages = "1918--1928",
abstract = "Visual question answering (VQA) is a challenging task that requires a computer system to understand both a question and an image. While there is much research on VQA in English, there is a lack of datasets for other languages, and English annotation is not directly applicable in those languages. To deal with this, we have created a Japanese VQA dataset by using crowdsourced annotation with images from the Visual Genome dataset. This is the first such dataset in Japanese. As another contribution, we propose a cross-lingual method for making use of English annotation to improve a Japanese VQA system. The proposed method is based on a popular VQA method that uses an attention mechanism. We use attention maps generated from English questions to help improve the Japanese VQA task. The proposed method experimentally performed better than simply using a monolingual corpus, which demonstrates the effectiveness of using attention maps to transfer cross-lingual information.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shimizu-etal-2018-visual">
<titleInfo>
<title>Visual Question Answering Dataset for Bilingual Image Understanding: A Study of Cross-Lingual Transfer Using Attention Maps</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nobuyuki</namePart>
<namePart type="family">Shimizu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Na</namePart>
<namePart type="family">Rong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Takashi</namePart>
<namePart type="family">Miyazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 27th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Bender</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leon</namePart>
<namePart type="family">Derczynski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Isabelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Santa Fe, New Mexico, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual question answering (VQA) is a challenging task that requires a computer system to understand both a question and an image. While there is much research on VQA in English, there is a lack of datasets for other languages, and English annotation is not directly applicable in those languages. To deal with this, we have created a Japanese VQA dataset by using crowdsourced annotation with images from the Visual Genome dataset. This is the first such dataset in Japanese. As another contribution, we propose a cross-lingual method for making use of English annotation to improve a Japanese VQA system. The proposed method is based on a popular VQA method that uses an attention mechanism. We use attention maps generated from English questions to help improve the Japanese VQA task. The proposed method experimentally performed better than simply using a monolingual corpus, which demonstrates the effectiveness of using attention maps to transfer cross-lingual information.</abstract>
<identifier type="citekey">shimizu-etal-2018-visual</identifier>
<location>
<url>https://aclanthology.org/C18-1163</url>
</location>
<part>
<date>2018-08</date>
<extent unit="page">
<start>1918</start>
<end>1928</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Visual Question Answering Dataset for Bilingual Image Understanding: A Study of Cross-Lingual Transfer Using Attention Maps
%A Shimizu, Nobuyuki
%A Rong, Na
%A Miyazaki, Takashi
%Y Bender, Emily M.
%Y Derczynski, Leon
%Y Isabelle, Pierre
%S Proceedings of the 27th International Conference on Computational Linguistics
%D 2018
%8 August
%I Association for Computational Linguistics
%C Santa Fe, New Mexico, USA
%F shimizu-etal-2018-visual
%X Visual question answering (VQA) is a challenging task that requires a computer system to understand both a question and an image. While there is much research on VQA in English, there is a lack of datasets for other languages, and English annotation is not directly applicable in those languages. To deal with this, we have created a Japanese VQA dataset by using crowdsourced annotation with images from the Visual Genome dataset. This is the first such dataset in Japanese. As another contribution, we propose a cross-lingual method for making use of English annotation to improve a Japanese VQA system. The proposed method is based on a popular VQA method that uses an attention mechanism. We use attention maps generated from English questions to help improve the Japanese VQA task. The proposed method experimentally performed better than simply using a monolingual corpus, which demonstrates the effectiveness of using attention maps to transfer cross-lingual information.
%U https://aclanthology.org/C18-1163
%P 1918-1928
Markdown (Informal)
[Visual Question Answering Dataset for Bilingual Image Understanding: A Study of Cross-Lingual Transfer Using Attention Maps](https://aclanthology.org/C18-1163) (Shimizu et al., COLING 2018)
ACL