@inproceedings{zhang-etal-2025-creating,
title = "Creating a Lens of {C}hinese Culture: A Multimodal Dataset for {C}hinese Pun Rebus Art Understanding",
author = "Zhang, Tuo and
Feng, Tiantian and
Ni, Yibin and
Cao, Mengqin and
Liu, Ruying and
Avestimehr, Kiana and
Butler, Katharine and
Weng, Yanjun and
Zhang, Mi and
Narayanan, Shrikanth and
Avestimehr, Salman",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1155/",
doi = "10.18653/v1/2025.findings-acl.1155",
pages = "22473--22487",
ISBN = "979-8-89176-256-5",
abstract = "Large vision-language models (VLMs) have demonstrated remarkable abilities in understanding everyday content. However, their performance in the domain of art, particularly culturally rich art forms, remains less explored. As a pearl of human wisdom and creativity, art encapsulates complex cultural narratives and symbolism. In this paper, we offer the Pun Rebus Art Dataset, a multimodal dataset for art understanding deeply rooted in traditional Chinese culture. We focus on three primary tasks: identifying salient visual elements, matching elements with their symbolic meanings, and explanations for the conveyed messages. Our evaluation reveals that state-of-the-art VLMs struggle with these tasks, often providing biased and hallucinated explanations and showing limited improvement through in-context learning. By releasing the Pun Rebus Art Dataset, we aim to facilitate the development of VLMs that can better understand and interpret culturally specific content, promoting greater inclusiveness beyond English-based corpora. The dataset and evaluation code are available at [this link](https://github.com/zhang-tuo-pdf/Pun-Rebus-Art-Benchmark)."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-creating">
<titleInfo>
<title>Creating a Lens of Chinese Culture: A Multimodal Dataset for Chinese Pun Rebus Art Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tuo</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiantian</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yibin</namePart>
<namePart type="family">Ni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mengqin</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruying</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kiana</namePart>
<namePart type="family">Avestimehr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katharine</namePart>
<namePart type="family">Butler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanjun</namePart>
<namePart type="family">Weng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shrikanth</namePart>
<namePart type="family">Narayanan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salman</namePart>
<namePart type="family">Avestimehr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Large vision-language models (VLMs) have demonstrated remarkable abilities in understanding everyday content. However, their performance in the domain of art, particularly culturally rich art forms, remains less explored. As a pearl of human wisdom and creativity, art encapsulates complex cultural narratives and symbolism. In this paper, we offer the Pun Rebus Art Dataset, a multimodal dataset for art understanding deeply rooted in traditional Chinese culture. We focus on three primary tasks: identifying salient visual elements, matching elements with their symbolic meanings, and explanations for the conveyed messages. Our evaluation reveals that state-of-the-art VLMs struggle with these tasks, often providing biased and hallucinated explanations and showing limited improvement through in-context learning. By releasing the Pun Rebus Art Dataset, we aim to facilitate the development of VLMs that can better understand and interpret culturally specific content, promoting greater inclusiveness beyond English-based corpora. The dataset and evaluation code are available at [this link](https://github.com/zhang-tuo-pdf/Pun-Rebus-Art-Benchmark).</abstract>
<identifier type="citekey">zhang-etal-2025-creating</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1155</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1155/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>22473</start>
<end>22487</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Creating a Lens of Chinese Culture: A Multimodal Dataset for Chinese Pun Rebus Art Understanding
%A Zhang, Tuo
%A Feng, Tiantian
%A Ni, Yibin
%A Cao, Mengqin
%A Liu, Ruying
%A Avestimehr, Kiana
%A Butler, Katharine
%A Weng, Yanjun
%A Zhang, Mi
%A Narayanan, Shrikanth
%A Avestimehr, Salman
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F zhang-etal-2025-creating
%X Large vision-language models (VLMs) have demonstrated remarkable abilities in understanding everyday content. However, their performance in the domain of art, particularly culturally rich art forms, remains less explored. As a pearl of human wisdom and creativity, art encapsulates complex cultural narratives and symbolism. In this paper, we offer the Pun Rebus Art Dataset, a multimodal dataset for art understanding deeply rooted in traditional Chinese culture. We focus on three primary tasks: identifying salient visual elements, matching elements with their symbolic meanings, and explanations for the conveyed messages. Our evaluation reveals that state-of-the-art VLMs struggle with these tasks, often providing biased and hallucinated explanations and showing limited improvement through in-context learning. By releasing the Pun Rebus Art Dataset, we aim to facilitate the development of VLMs that can better understand and interpret culturally specific content, promoting greater inclusiveness beyond English-based corpora. The dataset and evaluation code are available at [this link](https://github.com/zhang-tuo-pdf/Pun-Rebus-Art-Benchmark).
%R 10.18653/v1/2025.findings-acl.1155
%U https://aclanthology.org/2025.findings-acl.1155/
%U https://doi.org/10.18653/v1/2025.findings-acl.1155
%P 22473-22487
Markdown (Informal)
[Creating a Lens of Chinese Culture: A Multimodal Dataset for Chinese Pun Rebus Art Understanding](https://aclanthology.org/2025.findings-acl.1155/) (Zhang et al., Findings 2025)
ACL
- Tuo Zhang, Tiantian Feng, Yibin Ni, Mengqin Cao, Ruying Liu, Kiana Avestimehr, Katharine Butler, Yanjun Weng, Mi Zhang, Shrikanth Narayanan, and Salman Avestimehr. 2025. Creating a Lens of Chinese Culture: A Multimodal Dataset for Chinese Pun Rebus Art Understanding. In Findings of the Association for Computational Linguistics: ACL 2025, pages 22473–22487, Vienna, Austria. Association for Computational Linguistics.