@inproceedings{parfenova-etal-2021-probing,
title = "Probing Cross-Modal Representations in Multi-Step Relational Reasoning",
author = "Parfenova, Iuliia and
Elliott, Desmond and
Fern{\'a}ndez, Raquel and
Pezzelle, Sandro",
editor = "Rogers, Anna and
Calixto, Iacer and
Vuli{\'c}, Ivan and
Saphra, Naomi and
Kassner, Nora and
Camburu, Oana-Maria and
Bansal, Trapit and
Shwartz, Vered",
booktitle = "Proceedings of the 6th Workshop on Representation Learning for NLP (RepL4NLP-2021)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.repl4nlp-1.16",
doi = "10.18653/v1/2021.repl4nlp-1.16",
pages = "152--162",
abstract = "We investigate the representations learned by vision and language models in tasks that require relational reasoning. Focusing on the problem of assessing the relative size of objects in abstract visual contexts, we analyse both one-step and two-step reasoning. For the latter, we construct a new dataset of three-image scenes and define a task that requires reasoning at the level of the individual images and across images in a scene. We probe the learned model representations using diagnostic classifiers. Our experiments show that pretrained multimodal transformer-based architectures can perform higher-level relational reasoning, and are able to learn representations for novel tasks and data that are very different from what was seen in pretraining.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="parfenova-etal-2021-probing">
<titleInfo>
<title>Probing Cross-Modal Representations in Multi-Step Relational Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Iuliia</namePart>
<namePart type="family">Parfenova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Desmond</namePart>
<namePart type="family">Elliott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raquel</namePart>
<namePart type="family">Fernández</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandro</namePart>
<namePart type="family">Pezzelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Workshop on Representation Learning for NLP (RepL4NLP-2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iacer</namePart>
<namePart type="family">Calixto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Vulić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naomi</namePart>
<namePart type="family">Saphra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nora</namePart>
<namePart type="family">Kassner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oana-Maria</namePart>
<namePart type="family">Camburu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trapit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vered</namePart>
<namePart type="family">Shwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We investigate the representations learned by vision and language models in tasks that require relational reasoning. Focusing on the problem of assessing the relative size of objects in abstract visual contexts, we analyse both one-step and two-step reasoning. For the latter, we construct a new dataset of three-image scenes and define a task that requires reasoning at the level of the individual images and across images in a scene. We probe the learned model representations using diagnostic classifiers. Our experiments show that pretrained multimodal transformer-based architectures can perform higher-level relational reasoning, and are able to learn representations for novel tasks and data that are very different from what was seen in pretraining.</abstract>
<identifier type="citekey">parfenova-etal-2021-probing</identifier>
<identifier type="doi">10.18653/v1/2021.repl4nlp-1.16</identifier>
<location>
<url>https://aclanthology.org/2021.repl4nlp-1.16</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>152</start>
<end>162</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Probing Cross-Modal Representations in Multi-Step Relational Reasoning
%A Parfenova, Iuliia
%A Elliott, Desmond
%A Fernández, Raquel
%A Pezzelle, Sandro
%Y Rogers, Anna
%Y Calixto, Iacer
%Y Vulić, Ivan
%Y Saphra, Naomi
%Y Kassner, Nora
%Y Camburu, Oana-Maria
%Y Bansal, Trapit
%Y Shwartz, Vered
%S Proceedings of the 6th Workshop on Representation Learning for NLP (RepL4NLP-2021)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F parfenova-etal-2021-probing
%X We investigate the representations learned by vision and language models in tasks that require relational reasoning. Focusing on the problem of assessing the relative size of objects in abstract visual contexts, we analyse both one-step and two-step reasoning. For the latter, we construct a new dataset of three-image scenes and define a task that requires reasoning at the level of the individual images and across images in a scene. We probe the learned model representations using diagnostic classifiers. Our experiments show that pretrained multimodal transformer-based architectures can perform higher-level relational reasoning, and are able to learn representations for novel tasks and data that are very different from what was seen in pretraining.
%R 10.18653/v1/2021.repl4nlp-1.16
%U https://aclanthology.org/2021.repl4nlp-1.16
%U https://doi.org/10.18653/v1/2021.repl4nlp-1.16
%P 152-162
Markdown (Informal)
[Probing Cross-Modal Representations in Multi-Step Relational Reasoning](https://aclanthology.org/2021.repl4nlp-1.16) (Parfenova et al., RepL4NLP 2021)
ACL