% ACL Anthology record for the "Mind's Eye" benchmark paper (ACL 2026, Volume 1: Long Papers).
% Braces in title/booktitle protect proper nouns and acronyms from style-driven recasing.
@inproceedings{sinha-etal-2026-minds,
  title     = {Mind{'}s {Eye}: A Benchmark of Visual Abstraction, Transformation and Composition for Multimodal {LLMs}},
  author    = {Sinha, Rohit and
               Kanade, Aditya Sanjiv and
               Kancheti, Sai Srinivas and
               Balasubramanian, Vineeth N. and
               Ganu, Tanuja},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.2124/},
  pages     = {45794--45835},
  isbn      = {979-8-89176-390-6},
  abstract  = {Multimodal large language models (MLLMs) have achieved impressive progress on vision language benchmarks, yet their capacity for \textit{cognitive} and \textit{psychological} reasoning remains largely unexplored. We introduce Mind{'}s Eye, a multiple-choice benchmark of eight visuo-cognitive tasks inspired by classic human intelligence tests and organized under a novel \textbf{A{--}R{--}T} taxonomy: \textbf{Abstraction}, \textbf{Relation}, and \textbf{Transformation}. The tasks probe core processes of fluid intelligence such as pattern induction, analogical Relation mapping, and mental Transformation. We evaluate a diverse suite of closed-source and open-source MLLMs and compare their performance with human participants. Humans achieve \textbf{80{\%}} accuracy, while top performing MLLMs remain below \textbf{50{\%}}. Error analysis reveals failures in (i) visual attention allocation, (ii) internal perceptual manipulation, (iii) over reliance on domain priors, and (iv) weak abstraction of underlying visual concepts. Our findings suggest that current MLLMs exhibit limited fluid reasoning and visuo-cognitive integration compared with human participants, highlighting the need for cognitively grounded evaluation frameworks like Mind{'}s Eye.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record; the mods ID matches the citekey identifier below. -->
  <mods ID="sinha-etal-2026-minds">
    <titleInfo>
      <title>Mind’s Eye: A Benchmark of Visual Abstraction, Transformation and Composition for Multimodal LLMs</title>
    </titleInfo>

    <!-- Authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Rohit</namePart>
      <namePart type="family">Sinha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aditya</namePart>
      <namePart type="given">Sanjiv</namePart>
      <namePart type="family">Kanade</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sai</namePart>
      <namePart type="given">Srinivas</namePart>
      <namePart type="family">Kancheti</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vineeth</namePart>
      <namePart type="given">N</namePart>
      <namePart type="family">Balasubramanian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tanuja</namePart>
      <namePart type="family">Ganu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>

    <abstract>Multimodal large language models (MLLMs) have achieved impressive progress on vision language benchmarks, yet their capacity for cognitive and psychological reasoning remains largely unexplored. We introduce Mind’s Eye, a multiple-choice benchmark of eight visuo-cognitive tasks inspired by classic human intelligence tests and organized under a novel A–R–T taxonomy: Abstraction, Relation, and Transformation. The tasks probe core processes of fluid intelligence such as pattern induction, analogical Relation mapping, and mental Transformation. We evaluate a diverse suite of closed-source and open-source MLLMs and compare their performance with human participants. Humans achieve 80% accuracy, while top performing MLLMs remain below 50%. Error analysis reveals failures in (i) visual attention allocation, (ii) internal perceptual manipulation, (iii) over reliance on domain priors, and (iv) weak abstraction of underlying visual concepts. Our findings suggest that current MLLMs exhibit limited fluid reasoning and visuo-cognitive integration compared with human participants, highlighting the need for cognitively grounded evaluation frameworks like Mind’s Eye.</abstract>
    <identifier type="citekey">sinha-etal-2026-minds</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.2124/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>45794</start>
        <end>45835</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Mind’s Eye: A Benchmark of Visual Abstraction, Transformation and Composition for Multimodal LLMs
%A Sinha, Rohit
%A Kanade, Aditya Sanjiv
%A Kancheti, Sai Srinivas
%A Balasubramanian, Vineeth N.
%A Ganu, Tanuja
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F sinha-etal-2026-minds
%X Multimodal large language models (MLLMs) have achieved impressive progress on vision language benchmarks, yet their capacity for cognitive and psychological reasoning remains largely unexplored. We introduce Mind’s Eye, a multiple-choice benchmark of eight visuo-cognitive tasks inspired by classic human intelligence tests and organized under a novel A–R–T taxonomy: Abstraction, Relation, and Transformation. The tasks probe core processes of fluid intelligence such as pattern induction, analogical Relation mapping, and mental Transformation. We evaluate a diverse suite of closed-source and open-source MLLMs and compare their performance with human participants. Humans achieve 80% accuracy, while top performing MLLMs remain below 50%. Error analysis reveals failures in (i) visual attention allocation, (ii) internal perceptual manipulation, (iii) over reliance on domain priors, and (iv) weak abstraction of underlying visual concepts. Our findings suggest that current MLLMs exhibit limited fluid reasoning and visuo-cognitive integration compared with human participants, highlighting the need for cognitively grounded evaluation frameworks like Mind’s Eye.
%U https://aclanthology.org/2026.acl-long.2124/
%P 45794-45835
Markdown (Informal)
[Mind’s Eye: A Benchmark of Visual Abstraction, Transformation and Composition for Multimodal LLMs](https://aclanthology.org/2026.acl-long.2124/) (Sinha et al., ACL 2026)
ACL