@inproceedings{parcalabescu-etal-2021-seeing,
title = "Seeing past words: Testing the cross-modal capabilities of pretrained {V}{\&}{L} models on counting tasks",
author = "Parcalabescu, Letitia and
Gatt, Albert and
Frank, Anette and
Calixto, Iacer",
editor = "Donatelli, Lucia and
Krishnaswamy, Nikhil and
Lai, Kenneth and
Pustejovsky, James",
booktitle = "Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR)",
month = jun,
year = "2021",
address = "Groningen, Netherlands (Online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.mmsr-1.4",
pages = "32--44",
abstract = "We investigate the reasoning ability of pretrained vision and language (V{\&}L) models in two tasks that require multimodal integration: (1) discriminating a correct image-sentence pair from an incorrect one, and (2) counting entities in an image. We evaluate three pretrained V{\&}L models on these tasks: ViLBERT, ViLBERT 12-in-1 and LXMERT, in zero-shot and finetuned settings. Our results show that models solve task (1) very well, as expected, since all models are pretrained on task (1). However, none of the pretrained V{\&}L models is able to adequately solve task (2), our counting probe, and they cannot generalise to out-of-distribution quantities. We propose a number of explanations for these findings: LXMERT (and to some extent ViLBERT 12-in-1) show some evidence of catastrophic forgetting on task (1). Concerning our results on the counting probe, we find evidence that all models are impacted by dataset bias, and also fail to individuate entities in the visual input. While a selling point of pretrained V{\&}L models is their ability to solve complex tasks, our findings suggest that understanding their reasoning and grounding capabilities requires more targeted investigations on specific phenomena.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="parcalabescu-etal-2021-seeing">
<titleInfo>
<title>Seeing past words: Testing the cross-modal capabilities of pretrained V&amp;L models on counting tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Letitia</namePart>
<namePart type="family">Parcalabescu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Albert</namePart>
<namePart type="family">Gatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anette</namePart>
<namePart type="family">Frank</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iacer</namePart>
<namePart type="family">Calixto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Donatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikhil</namePart>
<namePart type="family">Krishnaswamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenneth</namePart>
<namePart type="family">Lai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Groningen, Netherlands (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We investigate the reasoning ability of pretrained vision and language (V&amp;L) models in two tasks that require multimodal integration: (1) discriminating a correct image-sentence pair from an incorrect one, and (2) counting entities in an image. We evaluate three pretrained V&amp;L models on these tasks: ViLBERT, ViLBERT 12-in-1 and LXMERT, in zero-shot and finetuned settings. Our results show that models solve task (1) very well, as expected, since all models are pretrained on task (1). However, none of the pretrained V&amp;L models is able to adequately solve task (2), our counting probe, and they cannot generalise to out-of-distribution quantities. We propose a number of explanations for these findings: LXMERT (and to some extent ViLBERT 12-in-1) show some evidence of catastrophic forgetting on task (1). Concerning our results on the counting probe, we find evidence that all models are impacted by dataset bias, and also fail to individuate entities in the visual input. While a selling point of pretrained V&amp;L models is their ability to solve complex tasks, our findings suggest that understanding their reasoning and grounding capabilities requires more targeted investigations on specific phenomena.</abstract>
<identifier type="citekey">parcalabescu-etal-2021-seeing</identifier>
<location>
<url>https://aclanthology.org/2021.mmsr-1.4</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>32</start>
<end>44</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Seeing past words: Testing the cross-modal capabilities of pretrained V&L models on counting tasks
%A Parcalabescu, Letitia
%A Gatt, Albert
%A Frank, Anette
%A Calixto, Iacer
%Y Donatelli, Lucia
%Y Krishnaswamy, Nikhil
%Y Lai, Kenneth
%Y Pustejovsky, James
%S Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR)
%D 2021
%8 June
%I Association for Computational Linguistics
%C Groningen, Netherlands (Online)
%F parcalabescu-etal-2021-seeing
%X We investigate the reasoning ability of pretrained vision and language (V&L) models in two tasks that require multimodal integration: (1) discriminating a correct image-sentence pair from an incorrect one, and (2) counting entities in an image. We evaluate three pretrained V&L models on these tasks: ViLBERT, ViLBERT 12-in-1 and LXMERT, in zero-shot and finetuned settings. Our results show that models solve task (1) very well, as expected, since all models are pretrained on task (1). However, none of the pretrained V&L models is able to adequately solve task (2), our counting probe, and they cannot generalise to out-of-distribution quantities. We propose a number of explanations for these findings: LXMERT (and to some extent ViLBERT 12-in-1) show some evidence of catastrophic forgetting on task (1). Concerning our results on the counting probe, we find evidence that all models are impacted by dataset bias, and also fail to individuate entities in the visual input. While a selling point of pretrained V&L models is their ability to solve complex tasks, our findings suggest that understanding their reasoning and grounding capabilities requires more targeted investigations on specific phenomena.
%U https://aclanthology.org/2021.mmsr-1.4
%P 32-44
Markdown (Informal)
[Seeing past words: Testing the cross-modal capabilities of pretrained V&L models on counting tasks](https://aclanthology.org/2021.mmsr-1.4) (Parcalabescu et al., MMSR 2021)
ACL
Letitia Parcalabescu, Albert Gatt, Anette Frank, and Iacer Calixto. 2021. Seeing past words: Testing the cross-modal capabilities of pretrained V&L models on counting tasks. In Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR), pages 32–44, Groningen, Netherlands (Online). Association for Computational Linguistics.