@inproceedings{azad-etal-2026-art,
title = "The Art of Saying ``Maybe'': A Conformal Lens for Uncertainty Benchmarking in {VLM}s",
author = "Azad, Asif and
Hossain, Mohammad Sadat and
Shanto, MD Sadik Hossain and
Rahman, M Saifur and
Parvez, Md Rizwan",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.274/",
pages = "5185--5201",
ISBN = "979-8-89176-386-9",
abstract = "Vision-Language Models (VLMs) have achieved remarkable progress in complex visual understanding across scientific and reasoning tasks. While performance benchmarking has advanced our understanding of these capabilities, the critical dimension of uncertainty quantification has received insufficient attention. Therefore, unlike prior conformal prediction studies that focused on limited settings, we conduct a comprehensive uncertainty benchmarking study, evaluating 18 state-of-the-art VLMs (open and closed-source) across 6 multimodal datasets with 3 distinct scoring functions. For closed-source models lacking token-level logprob access, we develop and validate instruction-guided likelihood proxies. Our findings demonstrate that larger models consistently exhibit better uncertainty quantification; models that know more also know better what they don{'}t know. More certain models achieve higher accuracy, while mathematical and reasoning tasks elicit poorer uncertainty performance across all models compared to other domains. This work establishes a foundation for reliable uncertainty evaluation in multimodal systems."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="azad-etal-2026-art">
<titleInfo>
<title>The Art of Saying “Maybe”: A Conformal Lens for Uncertainty Benchmarking in VLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Azad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Sadat</namePart>
<namePart type="family">Hossain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">MD</namePart>
<namePart type="given">Sadik</namePart>
<namePart type="given">Hossain</namePart>
<namePart type="family">Shanto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">M</namePart>
<namePart type="given">Saifur</namePart>
<namePart type="family">Rahman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Rizwan</namePart>
<namePart type="family">Parvez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Vision-Language Models (VLMs) have achieved remarkable progress in complex visual understanding across scientific and reasoning tasks. While performance benchmarking has advanced our understanding of these capabilities, the critical dimension of uncertainty quantification has received insufficient attention. Therefore, unlike prior conformal prediction studies that focused on limited settings, we conduct a comprehensive uncertainty benchmarking study, evaluating 18 state-of-the-art VLMs (open and closed-source) across 6 multimodal datasets with 3 distinct scoring functions. For closed-source models lacking token-level logprob access, we develop and validate instruction-guided likelihood proxies. Our findings demonstrate that larger models consistently exhibit better uncertainty quantification; models that know more also know better what they don’t know. More certain models achieve higher accuracy, while mathematical and reasoning tasks elicit poorer uncertainty performance across all models compared to other domains. This work establishes a foundation for reliable uncertainty evaluation in multimodal systems.</abstract>
<identifier type="citekey">azad-etal-2026-art</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.274/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>5185</start>
<end>5201</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Art of Saying “Maybe”: A Conformal Lens for Uncertainty Benchmarking in VLMs
%A Azad, Asif
%A Hossain, Mohammad Sadat
%A Shanto, MD Sadik Hossain
%A Rahman, M. Saifur
%A Parvez, Md Rizwan
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F azad-etal-2026-art
%X Vision-Language Models (VLMs) have achieved remarkable progress in complex visual understanding across scientific and reasoning tasks. While performance benchmarking has advanced our understanding of these capabilities, the critical dimension of uncertainty quantification has received insufficient attention. Therefore, unlike prior conformal prediction studies that focused on limited settings, we conduct a comprehensive uncertainty benchmarking study, evaluating 18 state-of-the-art VLMs (open and closed-source) across 6 multimodal datasets with 3 distinct scoring functions. For closed-source models lacking token-level logprob access, we develop and validate instruction-guided likelihood proxies. Our findings demonstrate that larger models consistently exhibit better uncertainty quantification; models that know more also know better what they don’t know. More certain models achieve higher accuracy, while mathematical and reasoning tasks elicit poorer uncertainty performance across all models compared to other domains. This work establishes a foundation for reliable uncertainty evaluation in multimodal systems.
%U https://aclanthology.org/2026.findings-eacl.274/
%P 5185-5201
Markdown (Informal)
[The Art of Saying "Maybe": A Conformal Lens for Uncertainty Benchmarking in VLMs](https://aclanthology.org/2026.findings-eacl.274/) (Azad et al., Findings 2026)
ACL