@inproceedings{hung-etal-2025-msr2,
title = "{MSR}$^2$: A Benchmark for Multi-Source Retrieval and Reasoning in Visual Question Answering",
author = "Hung, Kuo-Han and
Fang, Hung-Chieh and
Huang, Chao-Wei and
Chen, Yun-Nung",
editor = "Shi, Weijia and
Yu, Wenhao and
Asai, Akari and
Jiang, Meng and
Durrett, Greg and
Hajishirzi, Hannaneh and
Zettlemoyer, Luke",
booktitle = "Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.knowledgenlp-1.24/",
doi = "10.18653/v1/2025.knowledgenlp-1.24",
pages = "259--271",
ISBN = "979-8-89176-229-9",
abstract = "This paper introduces MSR$^2$, a benchmark for multi-source retrieval and reasoning in visual question answering. Unlike previous knowledge-based visual question answering datasets, MSR$^2$ focuses on questions involving multiple fine-grained entities, providing a unique opportunity to assess a model{'}s spatial reasoning ability and its capacity to retrieve and aggregate information from various sources for different entities. Through comprehensive evaluation using MSR$^2$, we gain valuable insights into the capabilities and limitations of state-of-the-art large vision-language models (LVLMs).Our findings reveal that even state-of-the-art LVLMs struggle with questions requiring multi-entities and knowledge-intensive reasoning, highlighting important new directions for future research.Additionally, we demonstrate that enhanced visual entity recognition and knowledge retrieval can significantly improve performance on MSR$^2$, pinpointing key areas for advancement."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="hung-etal-2025-msr2">
    <titleInfo>
      <title>MSR²: A Benchmark for Multi-Source Retrieval and Reasoning in Visual Question Answering</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Kuo-Han</namePart>
      <namePart type="family">Hung</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hung-Chieh</namePart>
      <namePart type="family">Fang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chao-Wei</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yun-Nung</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Weijia</namePart>
        <namePart type="family">Shi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wenhao</namePart>
        <namePart type="family">Yu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Akari</namePart>
        <namePart type="family">Asai</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Meng</namePart>
        <namePart type="family">Jiang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Greg</namePart>
        <namePart type="family">Durrett</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hannaneh</namePart>
        <namePart type="family">Hajishirzi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Luke</namePart>
        <namePart type="family">Zettlemoyer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-229-9</identifier>
    </relatedItem>
    <abstract>This paper introduces MSR², a benchmark for multi-source retrieval and reasoning in visual question answering. Unlike previous knowledge-based visual question answering datasets, MSR² focuses on questions involving multiple fine-grained entities, providing a unique opportunity to assess a model’s spatial reasoning ability and its capacity to retrieve and aggregate information from various sources for different entities. Through comprehensive evaluation using MSR², we gain valuable insights into the capabilities and limitations of state-of-the-art large vision-language models (LVLMs). Our findings reveal that even state-of-the-art LVLMs struggle with questions requiring multi-entity and knowledge-intensive reasoning, highlighting important new directions for future research. Additionally, we demonstrate that enhanced visual entity recognition and knowledge retrieval can significantly improve performance on MSR², pinpointing key areas for advancement.</abstract>
<identifier type="citekey">hung-etal-2025-msr2</identifier>
<identifier type="doi">10.18653/v1/2025.knowledgenlp-1.24</identifier>
<location>
<url>https://aclanthology.org/2025.knowledgenlp-1.24/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>259</start>
<end>271</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MSR²: A Benchmark for Multi-Source Retrieval and Reasoning in Visual Question Answering
%A Hung, Kuo-Han
%A Fang, Hung-Chieh
%A Huang, Chao-Wei
%A Chen, Yun-Nung
%Y Shi, Weijia
%Y Yu, Wenhao
%Y Asai, Akari
%Y Jiang, Meng
%Y Durrett, Greg
%Y Hajishirzi, Hannaneh
%Y Zettlemoyer, Luke
%S Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico, USA
%@ 979-8-89176-229-9
%F hung-etal-2025-msr2
%X This paper introduces MSR², a benchmark for multi-source retrieval and reasoning in visual question answering. Unlike previous knowledge-based visual question answering datasets, MSR² focuses on questions involving multiple fine-grained entities, providing a unique opportunity to assess a model’s spatial reasoning ability and its capacity to retrieve and aggregate information from various sources for different entities. Through comprehensive evaluation using MSR², we gain valuable insights into the capabilities and limitations of state-of-the-art large vision-language models (LVLMs). Our findings reveal that even state-of-the-art LVLMs struggle with questions requiring multi-entity and knowledge-intensive reasoning, highlighting important new directions for future research. Additionally, we demonstrate that enhanced visual entity recognition and knowledge retrieval can significantly improve performance on MSR², pinpointing key areas for advancement.
%R 10.18653/v1/2025.knowledgenlp-1.24
%U https://aclanthology.org/2025.knowledgenlp-1.24/
%U https://doi.org/10.18653/v1/2025.knowledgenlp-1.24
%P 259-271
Markdown (Informal)
[MSR2: A Benchmark for Multi-Source Retrieval and Reasoning in Visual Question Answering](https://aclanthology.org/2025.knowledgenlp-1.24/) (Hung et al., KnowledgeNLP 2025)
ACL
Kuo-Han Hung, Hung-Chieh Fang, Chao-Wei Huang, and Yun-Nung Chen. 2025. MSR²: A Benchmark for Multi-Source Retrieval and Reasoning in Visual Question Answering. In Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing, pages 259–271, Albuquerque, New Mexico, USA. Association for Computational Linguistics.