@inproceedings{nikandrou-etal-2025-crope,
title = "{CROPE}: Evaluating In-Context Adaptation of Vision and Language Models to Culture-Specific Concepts",
author = "Nikandrou, Malvina and
Pantazopoulos, Georgios and
Vitsakis, Nikolas and
Konstas, Ioannis and
Suglia, Alessandro",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.402/",
doi = "10.18653/v1/2025.naacl-long.402",
pages = "7917--7936",
isbn = "979-8-89176-189-6",
abstract = "As Vision and Language models (VLMs) become accessible across the globe, it is important that they demonstrate cultural knowledge. In this paper, we introduce CROPE, a visual question answering benchmark designed to probe the knowledge of culture-specific concepts and evaluate the capacity for cultural adaptation through contextual information. This allows us to distinguish between parametric knowledge acquired during training and contextual knowledge provided during inference via visual and textual descriptions. Our evaluation of several state-of-the-art open VLMs shows large performance disparities between culture-specific and common concepts in the parametric setting. Moreover, experiments with contextual knowledge indicate that models struggle to effectively utilize multimodal information and bind culture-specific concepts to their depictions. Our findings reveal limitations in the cultural understanding and adaptability of current VLMs that need to be addressed toward more culturally inclusive models."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nikandrou-etal-2025-crope">
<titleInfo>
<title>CROPE: Evaluating In-Context Adaptation of Vision and Language Models to Culture-Specific Concepts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Malvina</namePart>
<namePart type="family">Nikandrou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Georgios</namePart>
<namePart type="family">Pantazopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolas</namePart>
<namePart type="family">Vitsakis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ioannis</namePart>
<namePart type="family">Konstas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Suglia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>As Vision and Language models (VLMs) become accessible across the globe, it is important that they demonstrate cultural knowledge. In this paper, we introduce CROPE, a visual question answering benchmark designed to probe the knowledge of culture-specific concepts and evaluate the capacity for cultural adaptation through contextual information. This allows us to distinguish between parametric knowledge acquired during training and contextual knowledge provided during inference via visual and textual descriptions. Our evaluation of several state-of-the-art open VLMs shows large performance disparities between culture-specific and common concepts in the parametric setting. Moreover, experiments with contextual knowledge indicate that models struggle to effectively utilize multimodal information and bind culture-specific concepts to their depictions. Our findings reveal limitations in the cultural understanding and adaptability of current VLMs that need to be addressed toward more culturally inclusive models.</abstract>
<identifier type="citekey">nikandrou-etal-2025-crope</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.402</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.402/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>7917</start>
<end>7936</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CROPE: Evaluating In-Context Adaptation of Vision and Language Models to Culture-Specific Concepts
%A Nikandrou, Malvina
%A Pantazopoulos, Georgios
%A Vitsakis, Nikolas
%A Konstas, Ioannis
%A Suglia, Alessandro
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F nikandrou-etal-2025-crope
%X As Vision and Language models (VLMs) become accessible across the globe, it is important that they demonstrate cultural knowledge. In this paper, we introduce CROPE, a visual question answering benchmark designed to probe the knowledge of culture-specific concepts and evaluate the capacity for cultural adaptation through contextual information. This allows us to distinguish between parametric knowledge acquired during training and contextual knowledge provided during inference via visual and textual descriptions. Our evaluation of several state-of-the-art open VLMs shows large performance disparities between culture-specific and common concepts in the parametric setting. Moreover, experiments with contextual knowledge indicate that models struggle to effectively utilize multimodal information and bind culture-specific concepts to their depictions. Our findings reveal limitations in the cultural understanding and adaptability of current VLMs that need to be addressed toward more culturally inclusive models.
%R 10.18653/v1/2025.naacl-long.402
%U https://aclanthology.org/2025.naacl-long.402/
%U https://doi.org/10.18653/v1/2025.naacl-long.402
%P 7917-7936
Markdown (Informal)
[CROPE: Evaluating In-Context Adaptation of Vision and Language Models to Culture-Specific Concepts](https://aclanthology.org/2025.naacl-long.402/) (Nikandrou et al., NAACL 2025)
ACL