@inproceedings{monajatipoor-etal-2023-metavl,
title = "{M}eta{VL}: Transferring In-Context Learning Ability From Language Models to Vision-Language Models",
author = "Monajatipoor, Masoud and
Li, Liunian Harold and
Rouhsedaghat, Mozhdeh and
Yang, Lin and
Chang, Kai-Wei",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-short.43",
doi = "10.18653/v1/2023.acl-short.43",
pages = "495--508",
abstract = "Large-scale language models have shown the ability to adapt to a new task via conditioning on a few demonstrations (i.e., in-context learning). However, in the vision-language domain, most large-scale pre-trained vision-language (VL) models do not possess the ability to conduct in-context learning. How can we enable in-context learning for VL models? In this paper, we study an interesting hypothesis: can we transfer the in-context learning ability from the language domain to the VL domain? Specifically, we first meta-trains a language model to perform in-context learning on NLP tasks (as in MetaICL); then we transfer this model to perform VL tasks by attaching a visual encoder. Our experiments suggest that indeed in-context learning ability can be transferred cross modalities: our model considerably improves the in-context learning capability on VL tasks and can even compensate for the size of the model significantly. On VQA, OK-VQA, and GQA, our method could outperform the baseline model while having {\textasciitilde}20 times fewer parameters.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="monajatipoor-etal-2023-metavl">
<titleInfo>
<title>MetaVL: Transferring In-Context Learning Ability From Language Models to Vision-Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Masoud</namePart>
<namePart type="family">Monajatipoor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liunian</namePart>
<namePart type="given">Harold</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mozhdeh</namePart>
<namePart type="family">Rouhsedaghat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lin</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large-scale language models have shown the ability to adapt to a new task via conditioning on a few demonstrations (i.e., in-context learning). However, in the vision-language domain, most large-scale pre-trained vision-language (VL) models do not possess the ability to conduct in-context learning. How can we enable in-context learning for VL models? In this paper, we study an interesting hypothesis: can we transfer the in-context learning ability from the language domain to the VL domain? Specifically, we first meta-train a language model to perform in-context learning on NLP tasks (as in MetaICL); then we transfer this model to perform VL tasks by attaching a visual encoder. Our experiments suggest that in-context learning ability can indeed be transferred across modalities: our model considerably improves the in-context learning capability on VL tasks and can even significantly compensate for model size. On VQA, OK-VQA, and GQA, our method outperforms the baseline model while having ~20 times fewer parameters.</abstract>
<identifier type="citekey">monajatipoor-etal-2023-metavl</identifier>
<identifier type="doi">10.18653/v1/2023.acl-short.43</identifier>
<location>
<url>https://aclanthology.org/2023.acl-short.43</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>495</start>
<end>508</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MetaVL: Transferring In-Context Learning Ability From Language Models to Vision-Language Models
%A Monajatipoor, Masoud
%A Li, Liunian Harold
%A Rouhsedaghat, Mozhdeh
%A Yang, Lin
%A Chang, Kai-Wei
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F monajatipoor-etal-2023-metavl
%X Large-scale language models have shown the ability to adapt to a new task via conditioning on a few demonstrations (i.e., in-context learning). However, in the vision-language domain, most large-scale pre-trained vision-language (VL) models do not possess the ability to conduct in-context learning. How can we enable in-context learning for VL models? In this paper, we study an interesting hypothesis: can we transfer the in-context learning ability from the language domain to the VL domain? Specifically, we first meta-train a language model to perform in-context learning on NLP tasks (as in MetaICL); then we transfer this model to perform VL tasks by attaching a visual encoder. Our experiments suggest that in-context learning ability can indeed be transferred across modalities: our model considerably improves the in-context learning capability on VL tasks and can even significantly compensate for model size. On VQA, OK-VQA, and GQA, our method outperforms the baseline model while having ~20 times fewer parameters.
%R 10.18653/v1/2023.acl-short.43
%U https://aclanthology.org/2023.acl-short.43
%U https://doi.org/10.18653/v1/2023.acl-short.43
%P 495-508
Markdown (Informal)
[MetaVL: Transferring In-Context Learning Ability From Language Models to Vision-Language Models](https://aclanthology.org/2023.acl-short.43) (Monajatipoor et al., ACL 2023)
ACL
Masoud Monajatipoor, Liunian Harold Li, Mozhdeh Rouhsedaghat, Lin Yang, and Kai-Wei Chang. 2023. [MetaVL: Transferring In-Context Learning Ability From Language Models to Vision-Language Models](https://aclanthology.org/2023.acl-short.43). In *Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)*, pages 495–508, Toronto, Canada. Association for Computational Linguistics.