@inproceedings{wu-etal-2025-alignmmbench,
title = "{A}lign{MMB}ench: Evaluating {C}hinese Multimodal Alignment in Large Vision-Language Models",
author = "Wu, Yuhang and
Yu, Wenmeng and
Cheng, Yean and
Wang, Yan and
Zhang, Xiaohan and
Xu, Jiazheng and
Ding, Ming and
Dong, Yuxiao",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.327/",
doi = "10.18653/v1/2025.acl-long.327",
pages = "6541--6558",
ISBN = "979-8-89176-251-0",
abstract = "Evaluating the alignment capabilities of large Vision-Language Models (VLMs) is essential for determining their effectiveness as helpful assistants. However, existing benchmarks primarily focus on basic abilities using nonverbal methods, such as yes-no and multiple-choice questions. In this paper, we address this gap by introducing AlignMMBench, which provides more nuanced evaluations of alignment capabilities and is the first benchmark specifically designed for Chinese visual contexts. This benchmark is meticulously curated from real-world scenarios and internet sources, encompassing thirteen specific tasks across three categories, and includes both single-turn and multi-turn dialogue scenarios. Incorporating a prompt rewrite strategy, AlignMMBench encompasses 1,054 images and 4,978 question-answer pairs. To facilitate the evaluation pipeline, we develop CritiqueVLM, a rule-calibrated evaluator that exceeds GPT-4{'}s evaluation ability. Additionally, we measure the ``alignment score'', a quantitative metric designed to assess the robustness and stability of models across diverse prompts. Finally, we evaluate the performance of representative VLMs on AlignMMBench, offering insights into the capabilities and limitations of different VLM architectures. The evaluation code and data are available at https://github.com/THUDM/AlignMMBench."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-etal-2025-alignmmbench">
<titleInfo>
<title>AlignMMBench: Evaluating Chinese Multimodal Alignment in Large Vision-Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuhang</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenmeng</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yean</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaohan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiazheng</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuxiao</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Evaluating the alignment capabilities of large Vision-Language Models (VLMs) is essential for determining their effectiveness as helpful assistants. However, existing benchmarks primarily focus on basic abilities using nonverbal methods, such as yes-no and multiple-choice questions. In this paper, we address this gap by introducing AlignMMBench, which provides more nuanced evaluations of alignment capabilities and is the first benchmark specifically designed for Chinese visual contexts. This benchmark is meticulously curated from real-world scenarios and internet sources, encompassing thirteen specific tasks across three categories, and includes both single-turn and multi-turn dialogue scenarios. Incorporating a prompt rewrite strategy, AlignMMBench encompasses 1,054 images and 4,978 question-answer pairs. To facilitate the evaluation pipeline, we develop CritiqueVLM, a rule-calibrated evaluator that exceeds GPT-4’s evaluation ability. Additionally, we measure the “alignment score”, a quantitative metric designed to assess the robustness and stability of models across diverse prompts. Finally, we evaluate the performance of representative VLMs on AlignMMBench, offering insights into the capabilities and limitations of different VLM architectures. The evaluation code and data are available at https://github.com/THUDM/AlignMMBench.</abstract>
<identifier type="citekey">wu-etal-2025-alignmmbench</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.327</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.327/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>6541</start>
<end>6558</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AlignMMBench: Evaluating Chinese Multimodal Alignment in Large Vision-Language Models
%A Wu, Yuhang
%A Yu, Wenmeng
%A Cheng, Yean
%A Wang, Yan
%A Zhang, Xiaohan
%A Xu, Jiazheng
%A Ding, Ming
%A Dong, Yuxiao
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F wu-etal-2025-alignmmbench
%X Evaluating the alignment capabilities of large Vision-Language Models (VLMs) is essential for determining their effectiveness as helpful assistants. However, existing benchmarks primarily focus on basic abilities using nonverbal methods, such as yes-no and multiple-choice questions. In this paper, we address this gap by introducing AlignMMBench, which provides more nuanced evaluations of alignment capabilities and is the first benchmark specifically designed for Chinese visual contexts. This benchmark is meticulously curated from real-world scenarios and internet sources, encompassing thirteen specific tasks across three categories, and includes both single-turn and multi-turn dialogue scenarios. Incorporating a prompt rewrite strategy, AlignMMBench encompasses 1,054 images and 4,978 question-answer pairs. To facilitate the evaluation pipeline, we develop CritiqueVLM, a rule-calibrated evaluator that exceeds GPT-4’s evaluation ability. Additionally, we measure the “alignment score”, a quantitative metric designed to assess the robustness and stability of models across diverse prompts. Finally, we evaluate the performance of representative VLMs on AlignMMBench, offering insights into the capabilities and limitations of different VLM architectures. The evaluation code and data are available at https://github.com/THUDM/AlignMMBench.
%R 10.18653/v1/2025.acl-long.327
%U https://aclanthology.org/2025.acl-long.327/
%U https://doi.org/10.18653/v1/2025.acl-long.327
%P 6541-6558
Markdown (Informal)
[AlignMMBench: Evaluating Chinese Multimodal Alignment in Large Vision-Language Models](https://aclanthology.org/2025.acl-long.327/) (Wu et al., ACL 2025)
ACL
- Yuhang Wu, Wenmeng Yu, Yean Cheng, Yan Wang, Xiaohan Zhang, Jiazheng Xu, Ming Ding, and Yuxiao Dong. 2025. AlignMMBench: Evaluating Chinese Multimodal Alignment in Large Vision-Language Models. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 6541–6558, Vienna, Austria. Association for Computational Linguistics.