@inproceedings{voloshina-etal-2023-language,
    title = "Are Language-and-Vision Transformers Sensitive to Discourse? A Case Study of {V}i{LBERT}",
    author = "Voloshina, Ekaterina and
      Ilinykh, Nikolai and
      Dobnik, Simon",
    editor = "Gatt, Albert and
      Gardent, Claire and
      Cripwell, Liam and
      Belz, Anya and
      Borg, Claudia and
      Erdem, Aykut and
      Erdem, Erkut",
    booktitle = "Proceedings of the Workshop on Multimodal, Multilingual Natural Language Generation and Multilingual WebNLG Challenge (MM-NLG 2023)",
    month = sep,
    year = "2023",
    address = "Prague, Czech Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.mmnlg-1.4",
    pages = "28--38",
abstract = "Language-and-vision models have shown good performance in tasks such as image-caption matching and caption generation. However, it is challenging for such models to generate pragmatically correct captions, which adequately reflect what is happening in one image or several images. It is crucial to evaluate this behaviour to understand underlying reasons behind it. Here we explore to what extent contextual language-and-vision models are sensitive to different discourse, both textual and visual. In particular, we employ one of the multi-modal transformers (ViLBERT) and test if it can match descriptions and images, differentiating them from distractors of different degree of similarity that are sampled from different visual and textual contexts. We place our evaluation in the multi-sentence and multi-image setup, where images and sentences are expected to form a single narrative structure. We show that the model can distinguish different situations but it is not sensitive to differences within one narrative structure. We also show that performance depends on the task itself, for example, what modality remains unchanged in non-matching pairs or how similar non-matching pairs are to original pairs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="voloshina-etal-2023-language">
    <titleInfo>
      <title>Are Language-and-Vision Transformers Sensitive to Discourse? A Case Study of ViLBERT</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ekaterina</namePart>
      <namePart type="family">Voloshina</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nikolai</namePart>
      <namePart type="family">Ilinykh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Simon</namePart>
      <namePart type="family">Dobnik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Workshop on Multimodal, Multilingual Natural Language Generation and Multilingual WebNLG Challenge (MM-NLG 2023)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Albert</namePart>
        <namePart type="family">Gatt</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Claire</namePart>
        <namePart type="family">Gardent</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Liam</namePart>
        <namePart type="family">Cripwell</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anya</namePart>
        <namePart type="family">Belz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Claudia</namePart>
        <namePart type="family">Borg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Aykut</namePart>
        <namePart type="family">Erdem</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Erkut</namePart>
        <namePart type="family">Erdem</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Prague, Czech Republic</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Language-and-vision models have shown good performance in tasks such as image-caption matching and caption generation. However, it is challenging for such models to generate pragmatically correct captions that adequately reflect what is happening in one image or in several images. It is crucial to evaluate this behaviour in order to understand the underlying reasons behind it. Here we explore to what extent contextual language-and-vision models are sensitive to different kinds of discourse, both textual and visual. In particular, we employ a multi-modal transformer (ViLBERT) and test whether it can match descriptions and images, differentiating them from distractors of varying degrees of similarity sampled from different visual and textual contexts. We place our evaluation in a multi-sentence and multi-image setup, where images and sentences are expected to form a single narrative structure. We show that the model can distinguish different situations, but it is not sensitive to differences within one narrative structure. We also show that performance depends on the task itself: for example, on which modality remains unchanged in non-matching pairs, or on how similar the non-matching pairs are to the original pairs.</abstract>
    <identifier type="citekey">voloshina-etal-2023-language</identifier>
    <location>
      <url>https://aclanthology.org/2023.mmnlg-1.4</url>
    </location>
    <part>
      <date>2023-09</date>
      <extent unit="page">
        <start>28</start>
        <end>38</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Are Language-and-Vision Transformers Sensitive to Discourse? A Case Study of ViLBERT
%A Voloshina, Ekaterina
%A Ilinykh, Nikolai
%A Dobnik, Simon
%Y Gatt, Albert
%Y Gardent, Claire
%Y Cripwell, Liam
%Y Belz, Anya
%Y Borg, Claudia
%Y Erdem, Aykut
%Y Erdem, Erkut
%S Proceedings of the Workshop on Multimodal, Multilingual Natural Language Generation and Multilingual WebNLG Challenge (MM-NLG 2023)
%D 2023
%8 September
%I Association for Computational Linguistics
%C Prague, Czech Republic
%F voloshina-etal-2023-language
%X Language-and-vision models have shown good performance in tasks such as image-caption matching and caption generation. However, it is challenging for such models to generate pragmatically correct captions that adequately reflect what is happening in one image or in several images. It is crucial to evaluate this behaviour in order to understand the underlying reasons behind it. Here we explore to what extent contextual language-and-vision models are sensitive to different kinds of discourse, both textual and visual. In particular, we employ a multi-modal transformer (ViLBERT) and test whether it can match descriptions and images, differentiating them from distractors of varying degrees of similarity sampled from different visual and textual contexts. We place our evaluation in a multi-sentence and multi-image setup, where images and sentences are expected to form a single narrative structure. We show that the model can distinguish different situations, but it is not sensitive to differences within one narrative structure. We also show that performance depends on the task itself: for example, on which modality remains unchanged in non-matching pairs, or on how similar the non-matching pairs are to the original pairs.
%U https://aclanthology.org/2023.mmnlg-1.4
%P 28-38
Markdown (Informal)
[Are Language-and-Vision Transformers Sensitive to Discourse? A Case Study of ViLBERT](https://aclanthology.org/2023.mmnlg-1.4) (Voloshina et al., MMNLG-WS 2023)
ACL
Ekaterina Voloshina, Nikolai Ilinykh, and Simon Dobnik. 2023. Are Language-and-Vision Transformers Sensitive to Discourse? A Case Study of ViLBERT. In Proceedings of the Workshop on Multimodal, Multilingual Natural Language Generation and Multilingual WebNLG Challenge (MM-NLG 2023), pages 28–38, Prague, Czech Republic. Association for Computational Linguistics.