% BibTeX record for the paper at https://aclanthology.org/2026.findings-acl.888/
% Brace groups in title/booktitle protect whole words (benchmark name, acronyms,
% proper nouns) from the sentence-casing that many bibliography styles apply.
@inproceedings{yuan-etal-2026-dunhuang,
  title     = {{Dunhuang-Bench}: How Well Do {MLLMs} Understand Cultural Heritage?},
  author    = {Yuan, Junyi and
               Zhang, Jian and
               Yu, Tianxiu and
               Zhou, Yanlin and
               Jin, Xiaobo and
               Wang, Qiufeng and
               Wu, Fangyu},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.888/},
  pages     = {17894--17910},
  isbn      = {979-8-89176-395-1},
  abstract  = {Dunhuang art, a cornerstone of global heritage, demands fine-grained visual perception anchored by specialized cultural knowledge. Given the strong performance of multimodal large language models (MLLMs) on generic multimodal benchmarks, to what extent can they understand artifacts from Dunhuang art that are grounded in cultural context? To this end, we construct Dunhuang-Bench, a large-scale benchmark comprising 486 images and 22,970 QA pairs. It incorporates diverse task formats to evaluate MLLMs' cultural understanding: Question Answering with Text Description, Multi-turn Dialogue, and Question Answering with Choices. Guided by Panofsky's theory of iconology, we design two tasks including visual perception and knowledge reasoning for the evaluation of content understanding. In addition, we follow the theory of formal analytic tradition to design another task of artistic appreciation in our Dunhuang-Bench. Extensive evaluations of 20 mainstream MLLMs on Dunhuang-Bench reveal a consistent performance drop from perception and appreciation to reasoning. Moreover, CoT and few-shot prompting show marginal or negative impact, highlighting the limits of prompting-based improvements. Dunhuang-Bench thus provides a challenging benchmark for advancing multimodal cultural understanding. Data and code will be publicly available.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="yuan-etal-2026-dunhuang">
    <!-- Paper title -->
    <titleInfo>
      <title>Dunhuang-Bench: How Well Do MLLMs Understand Cultural Heritage?</title>
    </titleInfo>
    <!-- Authors: one <name> element per person, role "author" -->
    <name type="personal">
      <namePart type="given">Junyi</namePart>
      <namePart type="family">Yuan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jian</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tianxiu</namePart>
      <namePart type="family">Yu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yanlin</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiaobo</namePart>
      <namePart type="family">Jin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qiufeng</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fangyu</namePart>
      <namePart type="family">Wu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Issue date of the paper (year-month) -->
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume containing this paper -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <!-- Volume editors: one <name> element per person, role "editor" -->
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <!-- Publisher and place recorded for the host volume -->
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Dunhuang art, a cornerstone of global heritage, demands fine-grained visual perception anchored by specialized cultural knowledge. Given the strong performance of multimodal large language models (MLLMs) on generic multimodal benchmarks, to what extent can they understand artifacts from Dunhuang art that are grounded in cultural context? To this end, we construct Dunhuang-Bench, a large-scale benchmark comprising 486 images and 22,970 QA pairs. It incorporates diverse task formats to evaluate MLLMs’ cultural understanding: Question Answering with Text Description, Multi-turn Dialogue, and Question Answering with Choices. Guided by Panofsky’s theory of iconology, we design two tasks including visual perception and knowledge reasoning for the evaluation of content understanding. In addition, we follow the theory of formal analytic tradition to design another task of artistic appreciation in our Dunhuang-Bench. Extensive evaluations of 20 mainstream MLLMs on Dunhuang-Bench reveal a consistent performance drop from perception and appreciation to reasoning. Moreover, CoT and few-shot prompting show marginal or negative impact, highlighting the limits of prompting-based improvements. Dunhuang-Bench thus provides a challenging benchmark for advancing multimodal cultural understanding. Data and code will be publicly available.</abstract>
    <!-- Citation key; same value as the mods ID attribute -->
    <identifier type="citekey">yuan-etal-2026-dunhuang</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.888/</url>
    </location>
    <!-- Date and page extent of this paper -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>17894</start>
        <end>17910</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Dunhuang-Bench: How Well Do MLLMs Understand Cultural Heritage?
%A Yuan, Junyi
%A Zhang, Jian
%A Yu, Tianxiu
%A Zhou, Yanlin
%A Jin, Xiaobo
%A Wang, Qiufeng
%A Wu, Fangyu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F yuan-etal-2026-dunhuang
%X Dunhuang art, a cornerstone of global heritage, demands fine-grained visual perception anchored by specialized cultural knowledge. Given the strong performance of multimodal large language models (MLLMs) on generic multimodal benchmarks, to what extent can they understand artifacts from Dunhuang art that are grounded in cultural context? To this end, we construct Dunhuang-Bench, a large-scale benchmark comprising 486 images and 22,970 QA pairs. It incorporates diverse task formats to evaluate MLLMs’ cultural understanding: Question Answering with Text Description, Multi-turn Dialogue, and Question Answering with Choices. Guided by Panofsky’s theory of iconology, we design two tasks including visual perception and knowledge reasoning for the evaluation of content understanding. In addition, we follow the theory of formal analytic tradition to design another task of artistic appreciation in our Dunhuang-Bench. Extensive evaluations of 20 mainstream MLLMs on Dunhuang-Bench reveal a consistent performance drop from perception and appreciation to reasoning. Moreover, CoT and few-shot prompting show marginal or negative impact, highlighting the limits of prompting-based improvements. Dunhuang-Bench thus provides a challenging benchmark for advancing multimodal cultural understanding. Data and code will be publicly available.
%U https://aclanthology.org/2026.findings-acl.888/
%P 17894-17910
Markdown (Informal)
[Dunhuang-Bench: How Well Do MLLMs Understand Cultural Heritage?](https://aclanthology.org/2026.findings-acl.888/) (Yuan et al., Findings 2026)
ACL
- Junyi Yuan, Jian Zhang, Tianxiu Yu, Yanlin Zhou, Xiaobo Jin, Qiufeng Wang, and Fangyu Wu. 2026. Dunhuang-Bench: How Well Do MLLMs Understand Cultural Heritage? In Findings of the Association for Computational Linguistics: ACL 2026, pages 17894–17910, San Diego, California, United States. Association for Computational Linguistics.