@inproceedings{eisenschlos-etal-2024-selectively,
title = "Selectively Answering Visual Questions",
author = "Eisenschlos, Julian and
Maina, Hern{\'a}n and
Ivetta, Guido and
Benotti, Luciana",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.250/",
doi = "10.18653/v1/2024.findings-acl.250",
pages = "4219--4229",
abstract = "Recently, large multi-modal models (LMMs) have emerged with the capacity to perform vision tasks such as captioning and visual question answering (VQA) with unprecedented accuracy. Applications such as helping the blind or visually impaired have a critical need for precise answers. It is specially important for models to be well calibrated and be able to quantify their uncertainty in order to selectively decide when to answer and when to abstain or ask for clarifications. We perform the first in-depth analysis of calibration methods and metrics for VQA with in-context learning LMMs. Studying VQA on two answerability benchmarks, we show that the likelihood score of visually grounded models is better calibrated than in their text-only counterparts for in-context learning, where sampling based methods are generally superior, but no clear winner arises. We propose Avg BLEU, a calibration score combining the benefits of both sampling and likelihood methods across modalities."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="eisenschlos-etal-2024-selectively">
    <titleInfo>
      <title>Selectively Answering Visual Questions</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Julian</namePart>
      <namePart type="family">Eisenschlos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hernán</namePart>
      <namePart type="family">Maina</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guido</namePart>
      <namePart type="family">Ivetta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Luciana</namePart>
      <namePart type="family">Benotti</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andre</namePart>
        <namePart type="family">Martins</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivek</namePart>
        <namePart type="family">Srikumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recently, large multi-modal models (LMMs) have emerged with the capacity to perform vision tasks such as captioning and visual question answering (VQA) with unprecedented accuracy. Applications such as helping the blind or visually impaired have a critical need for precise answers. It is especially important for models to be well calibrated and able to quantify their uncertainty in order to selectively decide when to answer and when to abstain or ask for clarification. We perform the first in-depth analysis of calibration methods and metrics for VQA with in-context learning LMMs. Studying VQA on two answerability benchmarks, we show that the likelihood score of visually grounded models is better calibrated than that of their text-only counterparts for in-context learning, where sampling-based methods are generally superior, but no clear winner arises. We propose Avg BLEU, a calibration score combining the benefits of both sampling and likelihood methods across modalities.</abstract>
<identifier type="citekey">eisenschlos-etal-2024-selectively</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.250</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.250/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>4219</start>
<end>4229</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Selectively Answering Visual Questions
%A Eisenschlos, Julian
%A Maina, Hernán
%A Ivetta, Guido
%A Benotti, Luciana
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F eisenschlos-etal-2024-selectively
%X Recently, large multi-modal models (LMMs) have emerged with the capacity to perform vision tasks such as captioning and visual question answering (VQA) with unprecedented accuracy. Applications such as helping the blind or visually impaired have a critical need for precise answers. It is especially important for models to be well calibrated and able to quantify their uncertainty in order to selectively decide when to answer and when to abstain or ask for clarification. We perform the first in-depth analysis of calibration methods and metrics for VQA with in-context learning LMMs. Studying VQA on two answerability benchmarks, we show that the likelihood score of visually grounded models is better calibrated than that of their text-only counterparts for in-context learning, where sampling-based methods are generally superior, but no clear winner arises. We propose Avg BLEU, a calibration score combining the benefits of both sampling and likelihood methods across modalities.
%R 10.18653/v1/2024.findings-acl.250
%U https://aclanthology.org/2024.findings-acl.250/
%U https://doi.org/10.18653/v1/2024.findings-acl.250
%P 4219-4229
Markdown (Informal)
[Selectively Answering Visual Questions](https://aclanthology.org/2024.findings-acl.250/) (Eisenschlos et al., Findings 2024)
ACL
Julian Eisenschlos, Hernán Maina, Guido Ivetta, and Luciana Benotti. 2024. Selectively Answering Visual Questions. In Findings of the Association for Computational Linguistics: ACL 2024, pages 4219–4229, Bangkok, Thailand. Association for Computational Linguistics.
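
A quick illustration of the selective-answering idea described in the abstract: answer only when a calibrated confidence score clears a threshold, and abstain otherwise. The Python sketch below is a generic, hedged example of tuning such an abstention threshold on a toy calibration set; it is not the paper's Avg BLEU score (whose definition is not reproduced here), and the confidence values and correctness labels are hypothetical placeholders for a model's per-question outputs.

# Generic sketch of selective answering with a confidence threshold.
# NOT the paper's Avg BLEU method; inputs are hypothetical placeholders.
from typing import List


def pick_threshold(confidences: List[float],
                   correct: List[bool],
                   max_risk: float = 0.25) -> float:
    """Return the lowest threshold whose answered subset keeps the
    error rate (risk) at or below max_risk."""
    best = 1.0  # default: abstain on everything
    for t in sorted(set(confidences), reverse=True):
        answered = [ok for c, ok in zip(confidences, correct) if c >= t]
        if not answered:
            continue
        risk = sum(1 for ok in answered if not ok) / len(answered)
        if risk <= max_risk:
            best = t  # a lower threshold answers more questions
    return best


def decide(confidence: float, threshold: float) -> str:
    """Answer only when confidence clears the tuned threshold."""
    return "answer" if confidence >= threshold else "abstain"


if __name__ == "__main__":
    # Toy calibration set: confidence scores and whether the answer was correct.
    conf = [0.95, 0.80, 0.60, 0.40, 0.30]
    ok = [True, True, False, False, True]
    t = pick_threshold(conf, ok)           # -> 0.80 on this toy data
    print(decide(0.9, t), decide(0.5, t))  # answer abstain

On this toy data the tuned threshold is 0.80: answering only the two highest-confidence questions keeps the empirical risk at zero, while any lower threshold admits at least one wrong answer and exceeds the 25% risk budget.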