@inproceedings{kuhnle-copestake-2019-meaning,
title = "The Meaning of {``}Most{''} for Visual Question Answering Models",
author = "Kuhnle, Alexander and
Copestake, Ann",
editor = "Linzen, Tal and
Chrupa{\l}a, Grzegorz and
Belinkov, Yonatan and
Hupkes, Dieuwke",
booktitle = "Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-4806",
doi = "10.18653/v1/W19-4806",
pages = "46--55",
abstract = "The correct interpretation of quantifier statements in the context of a visual scene requires non-trivial inference mechanisms. For the example of {``}most{''}, we discuss two strategies which rely on fundamentally different cognitive concepts. Our aim is to identify what strategy deep learning models for visual question answering learn when trained on such questions. To this end, we carefully design data to replicate experiments from psycholinguistics where the same question was investigated for humans. Focusing on the FiLM visual question answering model, our experiments indicate that a form of approximate number system emerges whose performance declines with more difficult scenes as predicted by Weber{'}s law. Moreover, we identify confounding factors, like spatial arrangement of the scene, which impede the effectiveness of this system.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kuhnle-copestake-2019-meaning">
<titleInfo>
<title>The Meaning of “Most” for Visual Question Answering Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Kuhnle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ann</namePart>
<namePart type="family">Copestake</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tal</namePart>
<namePart type="family">Linzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Grzegorz</namePart>
<namePart type="family">Chrupała</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dieuwke</namePart>
<namePart type="family">Hupkes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The correct interpretation of quantifier statements in the context of a visual scene requires non-trivial inference mechanisms. For the example of “most”, we discuss two strategies which rely on fundamentally different cognitive concepts. Our aim is to identify what strategy deep learning models for visual question answering learn when trained on such questions. To this end, we carefully design data to replicate experiments from psycholinguistics where the same question was investigated for humans. Focusing on the FiLM visual question answering model, our experiments indicate that a form of approximate number system emerges whose performance declines with more difficult scenes as predicted by Weber’s law. Moreover, we identify confounding factors, like spatial arrangement of the scene, which impede the effectiveness of this system.</abstract>
<identifier type="citekey">kuhnle-copestake-2019-meaning</identifier>
<identifier type="doi">10.18653/v1/W19-4806</identifier>
<location>
<url>https://aclanthology.org/W19-4806</url>
</location>
<part>
<date>2019-08</date>
<extent unit="page">
<start>46</start>
<end>55</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Meaning of “Most” for Visual Question Answering Models
%A Kuhnle, Alexander
%A Copestake, Ann
%Y Linzen, Tal
%Y Chrupała, Grzegorz
%Y Belinkov, Yonatan
%Y Hupkes, Dieuwke
%S Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP
%D 2019
%8 August
%I Association for Computational Linguistics
%C Florence, Italy
%F kuhnle-copestake-2019-meaning
%X The correct interpretation of quantifier statements in the context of a visual scene requires non-trivial inference mechanisms. For the example of “most”, we discuss two strategies which rely on fundamentally different cognitive concepts. Our aim is to identify what strategy deep learning models for visual question answering learn when trained on such questions. To this end, we carefully design data to replicate experiments from psycholinguistics where the same question was investigated for humans. Focusing on the FiLM visual question answering model, our experiments indicate that a form of approximate number system emerges whose performance declines with more difficult scenes as predicted by Weber’s law. Moreover, we identify confounding factors, like spatial arrangement of the scene, which impede the effectiveness of this system.
%R 10.18653/v1/W19-4806
%U https://aclanthology.org/W19-4806
%U https://doi.org/10.18653/v1/W19-4806
%P 46-55
Markdown (Informal)
[The Meaning of “Most” for Visual Question Answering Models](https://aclanthology.org/W19-4806) (Kuhnle & Copestake, BlackboxNLP 2019)
ACL