@inproceedings{tiong-etal-2022-plug,
title = "Plug-and-Play {VQA}: Zero-shot {VQA} by Conjoining Large Pretrained Models with Zero Training",
author = "Tiong, Anthony Meng Huat and
Li, Junnan and
Li, Boyang and
Savarese, Silvio and
Hoi, Steven C.H.",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.67",
doi = "10.18653/v1/2022.findings-emnlp.67",
pages = "951--967",
abstract = "Visual question answering (VQA) is a hallmark of vision and language reasoningand a challenging task under the zero-shot setting.We propose Plug-and-Play VQA (PNP-VQA),a modular framework for zero-shot VQA.In contrast to most existing works, which require substantial adaptation of pretrained language models (PLMs) for the vision modality,PNP-VQA requires no additional training of the PLMs.Instead, we propose to use natural language and network interpretation as an intermediate representation that glues pretrained models together. We first generate question-guided informative image captions,and pass the captions to a PLM as context for question answering.Surpassing end-to-end trained baselines, PNP-VQA achieves state-of-the-art results on zero-shot VQAv2 and GQA. With 11B parameters, it outperforms the 80B-parameter Flamingo model by 8.5{\%} on VQAv2. With 738M PLM parameters, PNP-VQA achieves an improvement of 9.1{\%} on GQA over FewVLM with 740M PLM parameters.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tiong-etal-2022-plug">
<titleInfo>
<title>Plug-and-Play VQA: Zero-shot VQA by Conjoining Large Pretrained Models with Zero Training</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anthony</namePart>
<namePart type="given">Meng</namePart>
<namePart type="given">Huat</namePart>
<namePart type="family">Tiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junnan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Boyang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Silvio</namePart>
<namePart type="family">Savarese</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="given">C.H.</namePart>
<namePart type="family">Hoi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual question answering (VQA) is a hallmark of vision and language reasoning and a challenging task under the zero-shot setting. We propose Plug-and-Play VQA (PNP-VQA), a modular framework for zero-shot VQA. In contrast to most existing works, which require substantial adaptation of pretrained language models (PLMs) for the vision modality, PNP-VQA requires no additional training of the PLMs. Instead, we propose to use natural language and network interpretation as an intermediate representation that glues pretrained models together. We first generate question-guided informative image captions, and pass the captions to a PLM as context for question answering. Surpassing end-to-end trained baselines, PNP-VQA achieves state-of-the-art results on zero-shot VQAv2 and GQA. With 11B parameters, it outperforms the 80B-parameter Flamingo model by 8.5% on VQAv2. With 738M PLM parameters, PNP-VQA achieves an improvement of 9.1% on GQA over FewVLM with 740M PLM parameters.</abstract>
<identifier type="citekey">tiong-etal-2022-plug</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.67</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.67</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>951</start>
<end>967</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Plug-and-Play VQA: Zero-shot VQA by Conjoining Large Pretrained Models with Zero Training
%A Tiong, Anthony Meng Huat
%A Li, Junnan
%A Li, Boyang
%A Savarese, Silvio
%A Hoi, Steven C.H.
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F tiong-etal-2022-plug
%X Visual question answering (VQA) is a hallmark of vision and language reasoning and a challenging task under the zero-shot setting. We propose Plug-and-Play VQA (PNP-VQA), a modular framework for zero-shot VQA. In contrast to most existing works, which require substantial adaptation of pretrained language models (PLMs) for the vision modality, PNP-VQA requires no additional training of the PLMs. Instead, we propose to use natural language and network interpretation as an intermediate representation that glues pretrained models together. We first generate question-guided informative image captions, and pass the captions to a PLM as context for question answering. Surpassing end-to-end trained baselines, PNP-VQA achieves state-of-the-art results on zero-shot VQAv2 and GQA. With 11B parameters, it outperforms the 80B-parameter Flamingo model by 8.5% on VQAv2. With 738M PLM parameters, PNP-VQA achieves an improvement of 9.1% on GQA over FewVLM with 740M PLM parameters.
%R 10.18653/v1/2022.findings-emnlp.67
%U https://aclanthology.org/2022.findings-emnlp.67
%U https://doi.org/10.18653/v1/2022.findings-emnlp.67
%P 951-967
Markdown (Informal)
[Plug-and-Play VQA: Zero-shot VQA by Conjoining Large Pretrained Models with Zero Training](https://aclanthology.org/2022.findings-emnlp.67) (Tiong et al., Findings 2022)
ACL
Anthony Meng Huat Tiong, Junnan Li, Boyang Li, Silvio Savarese, and Steven C.H. Hoi. 2022. [Plug-and-Play VQA: Zero-shot VQA by Conjoining Large Pretrained Models with Zero Training](https://aclanthology.org/2022.findings-emnlp.67). In *Findings of the Association for Computational Linguistics: EMNLP 2022*, pages 951–967, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
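The abstract describes a two-stage, training-free pipeline: sample question-guided captions for the image with an off-the-shelf captioner, then pass those captions to a pretrained language model as reading context for the question. Below is a minimal sketch of that idea, assuming Hugging Face `transformers`; the checkpoints used here (a BLIP captioner and a FLAN-T5 reader) are illustrative stand-ins, not the paper's exact components, and the sketch omits the paper's question-guided image-patch selection step.

```python
# Minimal sketch of a PNP-VQA-style pipeline: caption, then read.
# Neither model is trained or fine-tuned; natural language captions are
# the only interface between the vision and language components.
from PIL import Image
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
)

# Off-the-shelf image captioner (illustrative checkpoint).
cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
captioner = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

# Off-the-shelf text QA reader (illustrative checkpoint).
qa_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
reader = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

def answer(image_path: str, question: str, num_captions: int = 5) -> str:
    image = Image.open(image_path).convert("RGB")
    # Sample several diverse captions to cover more image content.
    pixel_inputs = cap_processor(images=image, return_tensors="pt")
    caption_ids = captioner.generate(
        **pixel_inputs,
        do_sample=True,
        top_p=0.9,
        num_return_sequences=num_captions,
        max_new_tokens=30,
    )
    captions = cap_processor.batch_decode(caption_ids, skip_special_tokens=True)
    # Concatenated captions serve as reading-comprehension context for the PLM.
    prompt = (
        "Answer the question based on the context. "
        f"Context: {' '.join(captions)} Question: {question} Answer:"
    )
    qa_inputs = qa_tokenizer(prompt, return_tensors="pt", truncation=True)
    answer_ids = reader.generate(**qa_inputs, max_new_tokens=10)
    return qa_tokenizer.decode(answer_ids[0], skip_special_tokens=True)

# Example usage (hypothetical image path):
print(answer("photo.jpg", "What color is the bus?"))
```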