@inproceedings{reich-schultz-2024-uncovering,
title = "Uncovering the Full Potential of Visual Grounding Methods in {VQA}",
author = "Reich, Daniel and
Schultz, Tanja",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.241/",
doi = "10.18653/v1/2024.acl-long.241",
pages = "4406--4419",
abstract = "Visual Grounding (VG) methods in Visual Question Answering (VQA) attempt to improve VQA performance by strengthening a model`s reliance on question-relevant visual information. The presence of such relevant information in the visual input is typically assumed in training and testing. This assumption, however, is inherently flawed when dealing with imperfect image representations common in large-scale VQA, where the information carried by visual features frequently deviates from expected ground-truth contents. As a result, training and testing of VG-methods is performed with largely inaccurate data, which obstructs proper assessment of their potential benefits.In this study, we demonstrate that current evaluation schemes for VG-methods are problematic due to the flawed assumption of availability of relevant visual information. Our experiments show that these methods can be much more effective when evaluation conditions are corrected. Code is provided."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="reich-schultz-2024-uncovering">
<titleInfo>
<title>Uncovering the Full Potential of Visual Grounding Methods in VQA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Reich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanja</namePart>
<namePart type="family">Schultz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual Grounding (VG) methods in Visual Question Answering (VQA) attempt to improve VQA performance by strengthening a model‘s reliance on question-relevant visual information. The presence of such relevant information in the visual input is typically assumed in training and testing. This assumption, however, is inherently flawed when dealing with imperfect image representations common in large-scale VQA, where the information carried by visual features frequently deviates from expected ground-truth contents. As a result, training and testing of VG-methods is performed with largely inaccurate data, which obstructs proper assessment of their potential benefits.In this study, we demonstrate that current evaluation schemes for VG-methods are problematic due to the flawed assumption of availability of relevant visual information. Our experiments show that these methods can be much more effective when evaluation conditions are corrected. Code is provided.</abstract>
<identifier type="citekey">reich-schultz-2024-uncovering</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.241</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.241/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>4406</start>
<end>4419</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Uncovering the Full Potential of Visual Grounding Methods in VQA
%A Reich, Daniel
%A Schultz, Tanja
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F reich-schultz-2024-uncovering
%X Visual Grounding (VG) methods in Visual Question Answering (VQA) attempt to improve VQA performance by strengthening a model‘s reliance on question-relevant visual information. The presence of such relevant information in the visual input is typically assumed in training and testing. This assumption, however, is inherently flawed when dealing with imperfect image representations common in large-scale VQA, where the information carried by visual features frequently deviates from expected ground-truth contents. As a result, training and testing of VG-methods is performed with largely inaccurate data, which obstructs proper assessment of their potential benefits.In this study, we demonstrate that current evaluation schemes for VG-methods are problematic due to the flawed assumption of availability of relevant visual information. Our experiments show that these methods can be much more effective when evaluation conditions are corrected. Code is provided.
%R 10.18653/v1/2024.acl-long.241
%U https://aclanthology.org/2024.luhme-long.241/
%U https://doi.org/10.18653/v1/2024.acl-long.241
%P 4406-4419
Markdown (Informal)
[Uncovering the Full Potential of Visual Grounding Methods in VQA](https://aclanthology.org/2024.luhme-long.241/) (Reich & Schultz, ACL 2024)
ACL