BibTeX
@inproceedings{pitta-etal-2025-probing,
title = "Probing Vision-Language Understanding through the Visual Entailment Task: promises and pitfalls",
author = "Pitta, Elena and
Kouwenhoven, Tom and
Verhoef, Tessa",
editor = "Cardoso, Henrique Lopes and
Sousa-Silva, Rui and
Koponen, Maarit and
Pareja-Lora, Antonio",
booktitle = "Proceedings of the 2nd LUHME Workshop",
month = oct,
year = "2025",
address = "Bologna, Italy",
publisher = "LUHME",
url = "https://aclanthology.org/2025.luhme-1.8/",
pages = "74--83",
abstract = "This study investigates the extent to which the Visual Entailment (VE) task serves as a reliable probe of vision-language understanding in multimodal language models, using the LLaMA 3.2 11B Vision model as a test case. Beyond reporting performance metrics, we aim to interpret what these results reveal about the underlying possibilities and limitations of the VE task. We conduct a series of experiments across zero-shot, few-shot, and fine-tuning settings, exploring how factors such as prompt design, the number and order of in-context examples and access to visual information might affect VE performance. To further probe the reasoning processes of the model, we used explanation-based evaluations. Results indicate that three-shot inference outperforms the zero-shot baselines. However, additional examples introduce more noise than they provide benefits. Additionally, the order of the labels in the prompt is a critical factor that influences the predictions. In the absence of visual information, the model has a strong tendency to hallucinate and imagine content, raising questions about the model{'}s over-reliance on linguistic priors. Fine-tuning yields strong results, achieving an accuracy of 83.3{\%} on the e-SNLI-VE dataset and outperforming the state-of-the-art OFA-X model. Additionally, the explanation evaluation demonstrates that the fine-tuned model provides semantically meaningful explanations similar to those of humans, with a BERTScore F1-score of 89.2{\%}. We do, however, find comparable BERTScore results in experiments with limited vision, questioning the visual grounding of this task. Overall, our results highlight both the utility and limitations of VE as a diagnostic task for vision-language understanding and point to directions for refining multimodal evaluation methods."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="pitta-etal-2025-probing">
    <titleInfo>
      <title>Probing Vision-Language Understanding through the Visual Entailment Task: promises and pitfalls</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Elena</namePart>
      <namePart type="family">Pitta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tom</namePart>
      <namePart type="family">Kouwenhoven</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tessa</namePart>
      <namePart type="family">Verhoef</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-10</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd LUHME Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Henrique</namePart>
        <namePart type="given">Lopes</namePart>
        <namePart type="family">Cardoso</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rui</namePart>
        <namePart type="family">Sousa-Silva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Maarit</namePart>
        <namePart type="family">Koponen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Antonio</namePart>
        <namePart type="family">Pareja-Lora</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>LUHME</publisher>
        <place>
          <placeTerm type="text">Bologna, Italy</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This study investigates the extent to which the Visual Entailment (VE) task serves as a reliable probe of vision-language understanding in multimodal language models, using the LLaMA 3.2 11B Vision model as a test case. Beyond reporting performance metrics, we aim to interpret what these results reveal about the underlying possibilities and limitations of the VE task. We conduct a series of experiments across zero-shot, few-shot, and fine-tuning settings, exploring how factors such as prompt design, the number and order of in-context examples and access to visual information might affect VE performance. To further probe the reasoning processes of the model, we used explanation-based evaluations. Results indicate that three-shot inference outperforms the zero-shot baselines. However, additional examples introduce more noise than they provide benefits. Additionally, the order of the labels in the prompt is a critical factor that influences the predictions. In the absence of visual information, the model has a strong tendency to hallucinate and imagine content, raising questions about the model’s over-reliance on linguistic priors. Fine-tuning yields strong results, achieving an accuracy of 83.3% on the e-SNLI-VE dataset and outperforming the state-of-the-art OFA-X model. Additionally, the explanation evaluation demonstrates that the fine-tuned model provides semantically meaningful explanations similar to those of humans, with a BERTScore F1-score of 89.2%. We do, however, find comparable BERTScore results in experiments with limited vision, questioning the visual grounding of this task. Overall, our results highlight both the utility and limitations of VE as a diagnostic task for vision-language understanding and point to directions for refining multimodal evaluation methods.</abstract>
    <identifier type="citekey">pitta-etal-2025-probing</identifier>
    <location>
      <url>https://aclanthology.org/2025.luhme-1.8/</url>
    </location>
    <part>
      <date>2025-10</date>
      <extent unit="page">
        <start>74</start>
        <end>83</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Probing Vision-Language Understanding through the Visual Entailment Task: promises and pitfalls
%A Pitta, Elena
%A Kouwenhoven, Tom
%A Verhoef, Tessa
%Y Cardoso, Henrique Lopes
%Y Sousa-Silva, Rui
%Y Koponen, Maarit
%Y Pareja-Lora, Antonio
%S Proceedings of the 2nd LUHME Workshop
%D 2025
%8 October
%I LUHME
%C Bologna, Italy
%F pitta-etal-2025-probing
%X This study investigates the extent to which the Visual Entailment (VE) task serves as a reliable probe of vision-language understanding in multimodal language models, using the LLaMA 3.2 11B Vision model as a test case. Beyond reporting performance metrics, we aim to interpret what these results reveal about the underlying possibilities and limitations of the VE task. We conduct a series of experiments across zero-shot, few-shot, and fine-tuning settings, exploring how factors such as prompt design, the number and order of in-context examples and access to visual information might affect VE performance. To further probe the reasoning processes of the model, we used explanation-based evaluations. Results indicate that three-shot inference outperforms the zero-shot baselines. However, additional examples introduce more noise than they provide benefits. Additionally, the order of the labels in the prompt is a critical factor that influences the predictions. In the absence of visual information, the model has a strong tendency to hallucinate and imagine content, raising questions about the model’s over-reliance on linguistic priors. Fine-tuning yields strong results, achieving an accuracy of 83.3% on the e-SNLI-VE dataset and outperforming the state-of-the-art OFA-X model. Additionally, the explanation evaluation demonstrates that the fine-tuned model provides semantically meaningful explanations similar to those of humans, with a BERTScore F1-score of 89.2%. We do, however, find comparable BERTScore results in experiments with limited vision, questioning the visual grounding of this task. Overall, our results highlight both the utility and limitations of VE as a diagnostic task for vision-language understanding and point to directions for refining multimodal evaluation methods.
%U https://aclanthology.org/2025.luhme-1.8/
%P 74-83
Markdown (Informal)
[Probing Vision-Language Understanding through the Visual Entailment Task: promises and pitfalls](https://aclanthology.org/2025.luhme-1.8/) (Pitta et al., LUHME 2025)
ACL
Elena Pitta, Tom Kouwenhoven, and Tessa Verhoef. 2025. [Probing Vision-Language Understanding through the Visual Entailment Task: promises and pitfalls](https://aclanthology.org/2025.luhme-1.8/). In *Proceedings of the 2nd LUHME Workshop*, pages 74–83, Bologna, Italy. LUHME.