@inproceedings{yang-etal-2025-probench,
title = "{P}ro{B}ench: Judging Multimodal Foundation Models on Open-ended Multi-domain Expert Tasks",
author = "Yang, Yan and
Li, Dongxu and
Wu, Haoning and
Chen, Bei and
Liu, Liu and
Pan, Liyuan and
Li, Junnan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.568/",
doi = "10.18653/v1/2025.findings-acl.568",
pages = "10883--10892",
ISBN = "979-8-89176-256-5",
abstract = "Solving expert-level multimodal tasks is a key milestone in general intelligence. As the capabilities of multimodal large language models (MLLMs) continue to evolve, evaluation of frontier multimodal intelligence becomes necessary yet challenging. In this work, we introduce ProBench, a benchmark of open-ended user queries encapsulating professional expertise and advanced reasoning. ProBench consists of 4,000 high-quality samples independently collected from professionals based on their productivity demands. It spans across 10 fields and 56 sub-fields, including science, arts, humanities, coding, mathematics, and creative writing. Experimentally, we evaluate and compare 24 latest models using MLLM-as-a-Judge. Our results reveal that although the best open-source models rival the proprietary ones, they all face significant challenges in visual perception, textual understanding, domain knowledge, and advanced reasoning. Our benchmark is publicly accessible at \url{TBC}."
}