@inproceedings{zhao-etal-2025-multimodal,
title = "Are Multimodal {LLM}s Robust Against Adversarial Perturbations? {R}o{MM}ath: A Systematic Evaluation on Multimodal Math Reasoning",
author = "Zhao, Yilun and
Gan, Guo and
Wang, Chengye and
Zhao, Chen and
Cohan, Arman",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.582/",
doi = "10.18653/v1/2025.naacl-long.582",
pages = "11653--11665",
ISBN = "979-8-89176-189-6",
abstract = "We introduce RoMMath, the first benchmark designed to evaluate the capabilities and robustness of multimodal large language models (MLLMs) in handling multimodal math reasoning, particularly when faced with adversarial perturbations. RoMMath consists of 4,800 expert-annotated examples, including an original set and seven adversarial sets, each targeting a specific type of perturbation at the text or vision levels. We evaluate a broad spectrum of 17 MLLMs on RoMMath and uncover a critical challenge regarding model robustness against adversarial perturbations. Through detailed error analysis by human experts, we gain a deeper understanding of the current limitations of MLLMs. Additionally, we explore various approaches to enhance the performance and robustness of MLLMs, providing insights that can guide future research efforts."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2025-multimodal">
<titleInfo>
<title>Are Multimodal LLMs Robust Against Adversarial Perturbations? RoMMath: A Systematic Evaluation on Multimodal Math Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yilun</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guo</namePart>
<namePart type="family">Gan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengye</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arman</namePart>
<namePart type="family">Cohan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>We introduce RoMMath, the first benchmark designed to evaluate the capabilities and robustness of multimodal large language models (MLLMs) in handling multimodal math reasoning, particularly when faced with adversarial perturbations. RoMMath consists of 4,800 expert-annotated examples, including an original set and seven adversarial sets, each targeting a specific type of perturbation at the text or vision levels. We evaluate a broad spectrum of 17 MLLMs on RoMMath and uncover a critical challenge regarding model robustness against adversarial perturbations. Through detailed error analysis by human experts, we gain a deeper understanding of the current limitations of MLLMs. Additionally, we explore various approaches to enhance the performance and robustness of MLLMs, providing insights that can guide future research efforts.</abstract>
<identifier type="citekey">zhao-etal-2025-multimodal</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.582</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.582/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>11653</start>
<end>11665</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Are Multimodal LLMs Robust Against Adversarial Perturbations? RoMMath: A Systematic Evaluation on Multimodal Math Reasoning
%A Zhao, Yilun
%A Gan, Guo
%A Wang, Chengye
%A Zhao, Chen
%A Cohan, Arman
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F zhao-etal-2025-multimodal
%X We introduce RoMMath, the first benchmark designed to evaluate the capabilities and robustness of multimodal large language models (MLLMs) in handling multimodal math reasoning, particularly when faced with adversarial perturbations. RoMMath consists of 4,800 expert-annotated examples, including an original set and seven adversarial sets, each targeting a specific type of perturbation at the text or vision levels. We evaluate a broad spectrum of 17 MLLMs on RoMMath and uncover a critical challenge regarding model robustness against adversarial perturbations. Through detailed error analysis by human experts, we gain a deeper understanding of the current limitations of MLLMs. Additionally, we explore various approaches to enhance the performance and robustness of MLLMs, providing insights that can guide future research efforts.
%R 10.18653/v1/2025.naacl-long.582
%U https://aclanthology.org/2025.naacl-long.582/
%U https://doi.org/10.18653/v1/2025.naacl-long.582
%P 11653-11665
Markdown (Informal)
[Are Multimodal LLMs Robust Against Adversarial Perturbations? RoMMath: A Systematic Evaluation on Multimodal Math Reasoning](https://aclanthology.org/2025.naacl-long.582/) (Zhao et al., NAACL 2025)
ACL