@inproceedings{zhou-etal-2025-human,
title = "On the Human-level Performance of Visual Question Answering",
author = "Zhou, Chenlian and
Chen, Guanyi and
Bai, Xin and
Dong, Ming",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.277/",
pages = "4109--4113",
abstract = "Visual7W has been widely used to assess multiple-choice visual question answering (VQA) systems. This paper reports on a replication of the human experiment on Visual7W, with the aim of understanding human-level performance on VQA. The replication was not entirely successful: human participants performed significantly worse on {\textquotedblleft}where{\textquotedblright}, {\textquotedblleft}when{\textquotedblright}, and {\textquotedblleft}how{\textquotedblright} questions than on other question types. An error analysis revealed that this failure is a consequence of the non-deterministic distractors in Visual7W. GPT-4V was then evaluated on Visual7W and compared against human-level performance. The results show that, when evaluating models on Visual7W, higher performance is not necessarily better."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-etal-2025-human">
<titleInfo>
<title>On the Human-level Performance of Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chenlian</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guanyi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual7W has been widely used to assess multiple-choice visual question answering (VQA) systems. This paper reports on a replication of the human experiment on Visual7W, with the aim of understanding human-level performance on VQA. The replication was not entirely successful: human participants performed significantly worse on “where”, “when”, and “how” questions than on other question types. An error analysis revealed that this failure is a consequence of the non-deterministic distractors in Visual7W. GPT-4V was then evaluated on Visual7W and compared against human-level performance. The results show that, when evaluating models on Visual7W, higher performance is not necessarily better.</abstract>
<identifier type="citekey">zhou-etal-2025-human</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.277/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>4109</start>
<end>4113</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Human-level Performance of Visual Question Answering
%A Zhou, Chenlian
%A Chen, Guanyi
%A Bai, Xin
%A Dong, Ming
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhou-etal-2025-human
%X Visual7W has been widely used to assess multiple-choice visual question answering (VQA) systems. This paper reports on a replication of the human experiment on Visual7W, with the aim of understanding human-level performance on VQA. The replication was not entirely successful: human participants performed significantly worse on “where”, “when”, and “how” questions than on other question types. An error analysis revealed that this failure is a consequence of the non-deterministic distractors in Visual7W. GPT-4V was then evaluated on Visual7W and compared against human-level performance. The results show that, when evaluating models on Visual7W, higher performance is not necessarily better.
%U https://aclanthology.org/2025.coling-main.277/
%P 4109-4113
Markdown (Informal)
[On the Human-level Performance of Visual Question Answering](https://aclanthology.org/2025.coling-main.277/) (Zhou et al., COLING 2025)
ACL
Chenlian Zhou, Guanyi Chen, Xin Bai, and Ming Dong. 2025. On the Human-level Performance of Visual Question Answering. In Proceedings of the 31st International Conference on Computational Linguistics, pages 4109–4113, Abu Dhabi, UAE. Association for Computational Linguistics.