@inproceedings{bai-pezzelle-2025-feel,
title = "If {I} feel smart, {I} will do the right thing: Combining Complementary Multimodal Information in Visual Language Models",
author = "Bai, Yuyu and
Pezzelle, Sandro",
editor = "Zhang, Wei Emma and
Dai, Xiang and
Elliot, Desmond and
Fang, Byron and
Sim, Mongyuan and
Zhuang, Haojie and
Chen, Weitong",
booktitle = "Proceedings of the First Workshop of Evaluation of Multi-Modal Generation",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.evalmg-1.3/",
pages = "24--39",
abstract = "Generative visual language models (VLMs) have recently shown potential across various downstream language-and-vision tasks. At the same time, it is still an open question whether, and to what extent, these models can properly understand a multimodal context where language and vision provide complementary information{---}a mechanism routinely in place in human language communication. In this work, we test various VLMs on the task of generating action descriptions consistent with both an image`s visual content and an intention or attitude (not visually grounded) conveyed by a textual prompt. Our results show that BLIP-2 is not far from human performance when the task is framed as a generative multiple-choice problem, while other models struggle. Furthermore, the actions generated by BLIP-2 in an open-ended generative setting are better than those by the competitors; indeed, human annotators judge most of them as plausible continuations for the multimodal context. Our study reveals substantial variability among VLMs in integrating complementary multimodal information, yet BLIP-2 demonstrates promising trends across most evaluations, paving the way for seamless human-computer interaction."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bai-pezzelle-2025-feel">
<titleInfo>
<title>If I feel smart, I will do the right thing: Combining Complementary Multimodal Information in Visual Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuyu</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandro</namePart>
<namePart type="family">Pezzelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop of Evaluation of Multi-Modal Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="given">Emma</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Dai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Desmond</namePart>
<namePart type="family">Elliot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Byron</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mongyuan</namePart>
<namePart type="family">Sim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haojie</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weitong</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Generative visual language models (VLMs) have recently shown potential across various downstream language-and-vision tasks. At the same time, it is still an open question whether, and to what extent, these models can properly understand a multimodal context where language and vision provide complementary information—a mechanism routinely in place in human language communication. In this work, we test various VLMs on the task of generating action descriptions consistent with both an image‘s visual content and an intention or attitude (not visually grounded) conveyed by a textual prompt. Our results show that BLIP-2 is not far from human performance when the task is framed as a generative multiple-choice problem, while other models struggle. Furthermore, the actions generated by BLIP-2 in an open-ended generative setting are better than those by the competitors; indeed, human annotators judge most of them as plausible continuations for the multimodal context. Our study reveals substantial variability among VLMs in integrating complementary multimodal information, yet BLIP-2 demonstrates promising trends across most evaluations, paving the way for seamless human-computer interaction.</abstract>
<identifier type="citekey">bai-pezzelle-2025-feel</identifier>
<location>
<url>https://aclanthology.org/2025.evalmg-1.3/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>24</start>
<end>39</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T If I feel smart, I will do the right thing: Combining Complementary Multimodal Information in Visual Language Models
%A Bai, Yuyu
%A Pezzelle, Sandro
%Y Zhang, Wei Emma
%Y Dai, Xiang
%Y Elliot, Desmond
%Y Fang, Byron
%Y Sim, Mongyuan
%Y Zhuang, Haojie
%Y Chen, Weitong
%S Proceedings of the First Workshop of Evaluation of Multi-Modal Generation
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F bai-pezzelle-2025-feel
%X Generative visual language models (VLMs) have recently shown potential across various downstream language-and-vision tasks. At the same time, it is still an open question whether, and to what extent, these models can properly understand a multimodal context where language and vision provide complementary information—a mechanism routinely in place in human language communication. In this work, we test various VLMs on the task of generating action descriptions consistent with both an image‘s visual content and an intention or attitude (not visually grounded) conveyed by a textual prompt. Our results show that BLIP-2 is not far from human performance when the task is framed as a generative multiple-choice problem, while other models struggle. Furthermore, the actions generated by BLIP-2 in an open-ended generative setting are better than those by the competitors; indeed, human annotators judge most of them as plausible continuations for the multimodal context. Our study reveals substantial variability among VLMs in integrating complementary multimodal information, yet BLIP-2 demonstrates promising trends across most evaluations, paving the way for seamless human-computer interaction.
%U https://aclanthology.org/2025.evalmg-1.3/
%P 24-39
Markdown (Informal)
[If I feel smart, I will do the right thing: Combining Complementary Multimodal Information in Visual Language Models](https://aclanthology.org/2025.evalmg-1.3/) (Bai & Pezzelle, EvalMG 2025)
ACL