@inproceedings{wu-etal-2026-mmra,
title = "{MMRA}: A Benchmark for Evaluating Multi-Granularity and Multi-Image Relational Association Capabilities in Large Visual Language Models",
author = "Wu, Siwei and
Zhu, King and
Bai, Yu and
Liang, Yiming and
Li, Yizhi and
Wu, Haoning and
Liu, Jiaheng and
Liu, Ruibo and
Qu, Xingwei and
Cheng, Xuxin and
Zhang, Ge and
Huang, Wenhao and
Lin, Chenghua",
editor = "Demberg, Vera and
Inui, Kentaro and
M{\`a}rquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.287/",
pages = "5405--5419",
ISBN = "979-8-89176-386-9",
abstract = "Current multi-modal benchmarks primarily focus on facts within individual images. However, they overlook the associative relations among multiple images, which necessitate conducting commonsense reasoning grounded in associated knowledge at different granularities (i.e., image-level and entity-level) as well as the ability to perceive the order of images. Therefore, we propose a multi-image relational association task and a meticulously curated Multi-granularity Multi-image Relational Association (MMRA) benchmark, comprising 1,024 samples. To systematically evaluate current LVLMs, we establish a system of associative relations among images that contains 11 subtasks (e.g., UsageSimilarity, SubEvent, etc.) at two granularity levels (i.e., image-level and entity-level), based on relations in ConceptNet. Our experiments reveal that entity-level multi-image perception tasks pose greater challenges for LVLMs than image-level tasks. Moreover, LVLMs perform poorly on spatial-related tasks, indicating limited spatial awareness. Furthermore, we find that LVLMs exhibit weak image order perception capabilities, and we design a method to significantly improve this ability, demonstrating that most current LVLMs do not adequately consider image order perception during pre-training."
}