% ACL Anthology BibTeX record; citation key: wu-etal-2026-seeing.
% Case-sensitive words in title/booktitle are brace-protected as whole words.
@inproceedings{wu-etal-2026-seeing,
  title     = {Seeing Beyond Words: {MatVQA} for Challenging Visual-Scientific Reasoning in Materials Science},
  author    = {Wu, Sifan and
               Zhang, Huan and
               Li, Yizhan and
               Effaty, Farshid and
               Mei, Hongyuan and
               Ataei, Amirreza and
               Liu, Bang},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1181/},
  doi       = {10.18653/v1/2026.findings-acl.1181},
  pages     = {23599--23614},
  isbn      = {979-8-89176-395-1},
  abstract  = {The emergence of Multimodal Large Language Models (MLLMs) that integrate vision and language modalities has unlocked new potentials for scientific reasoning, outperforming prior benchmarks in both natural language and coding domains. Current materials science evaluation datasets such as MaScQA and SciQA remain largely text-based and fail to capture the visual and research-level analytic complexity required in materials discovery and design. We introduce MatVQA, a scalable benchmark specifically designed to address this gap. Generated via an automated pipeline, MArxivAgent, from recent materials literature, MatVQA features 1672 questions across four critical structure-property-performance (SPP) reasoning tasks. Uniquely, MatVQA employs an iterative process to eliminate textual shortcuts, compelling MLLMs to perform fine-grained, low-level visual analysis of material imagery (e.g., microscopy, diffraction patterns) integrated with multi-step scientific reasoning. Benchmarking 19 open- and closed-source MLLMs on MatVQA reveals substantial gaps in current multimodal reasoning capabilities. The MatVQA benchmark is publicly available[{\ensuremath{<}}https://huggingface.co/datasets/trqcbf/matvqa{\_}v2{\ensuremath{>}}] to facilitate further research on applying MLLMs to complex materials science problems.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-etal-2026-seeing">
<titleInfo>
<title>Seeing Beyond Words: MatVQA for Challenging Visual-Scientific Reasoning in Materials Science</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sifan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yizhan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Farshid</namePart>
<namePart type="family">Effaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongyuan</namePart>
<namePart type="family">Mei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amirreza</namePart>
<namePart type="family">Ataei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>The emergence of Multimodal Large Language Models (MLLMs) that integrate vision and language modalities has unlocked new potentials for scientific reasoning, outperforming prior benchmarks in both natural language and coding domains. Current materials science evaluation datasets such as MaScQA and SciQA remain largely text-based and fail to capture the visual and research-level analytic complexity required in materials discovery and design. We introduce MatVQA, a scalable benchmark specifically designed to address this gap. Generated via an automated pipeline, MArxivAgent, from recent materials literature, MatVQA features 1672 questions across four critical structure-property-performance (SPP) reasoning tasks. Uniquely, MatVQA employs an iterative process to eliminate textual shortcuts, compelling MLLMs to perform fine-grained, low-level visual analysis of material imagery (e.g., microscopy, diffraction patterns) integrated with multi-step scientific reasoning. Benchmarking 19 open- and closed-source MLLMs on MatVQA reveals substantial gaps in current multimodal reasoning capabilities. The MatVQA benchmark is publicly available[&lt;https://huggingface.co/datasets/trqcbf/matvqa_v2&gt;] to facilitate further research on applying MLLMs to complex materials science problems.</abstract>
<identifier type="citekey">wu-etal-2026-seeing</identifier>
<identifier type="doi">10.18653/v1/2026.findings-acl.1181</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1181/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>23599</start>
<end>23614</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Seeing Beyond Words: MatVQA for Challenging Visual-Scientific Reasoning in Materials Science
%A Wu, Sifan
%A Zhang, Huan
%A Li, Yizhan
%A Effaty, Farshid
%A Mei, Hongyuan
%A Ataei, Amirreza
%A Liu, Bang
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F wu-etal-2026-seeing
%X The emergence of Multimodal Large Language Models (MLLMs) that integrate vision and language modalities has unlocked new potentials for scientific reasoning, outperforming prior benchmarks in both natural language and coding domains. Current materials science evaluation datasets such as MaScQA and SciQA remain largely text-based and fail to capture the visual and research-level analytic complexity required in materials discovery and design. We introduce MatVQA, a scalable benchmark specifically designed to address this gap. Generated via an automated pipeline, MArxivAgent, from recent materials literature, MatVQA features 1672 questions across four critical structure-property-performance (SPP) reasoning tasks. Uniquely, MatVQA employs an iterative process to eliminate textual shortcuts, compelling MLLMs to perform fine-grained, low-level visual analysis of material imagery (e.g., microscopy, diffraction patterns) integrated with multi-step scientific reasoning. Benchmarking 19 open- and closed-source MLLMs on MatVQA reveals substantial gaps in current multimodal reasoning capabilities. The MatVQA benchmark is publicly available[<https://huggingface.co/datasets/trqcbf/matvqa_v2>] to facilitate further research on applying MLLMs to complex materials science problems.
%R 10.18653/v1/2026.findings-acl.1181
%U https://aclanthology.org/2026.findings-acl.1181/
%U https://doi.org/10.18653/v1/2026.findings-acl.1181
%P 23599-23614
Markdown (Informal)
[Seeing Beyond Words: MatVQA for Challenging Visual-Scientific Reasoning in Materials Science](https://aclanthology.org/2026.findings-acl.1181/) (Wu et al., Findings 2026)
ACL
- Sifan Wu, Huan Zhang, Yizhan Li, Farshid Effaty, Hongyuan Mei, Amirreza Ataei, and Bang Liu. 2026. Seeing Beyond Words: MatVQA for Challenging Visual-Scientific Reasoning in Materials Science. In Findings of the Association for Computational Linguistics: ACL 2026, pages 23599–23614, San Diego, California, United States. Association for Computational Linguistics.