@inproceedings{nayak-etal-2025-culturalframes,
title = "{C}ultural{F}rames: Assessing Cultural Expectation Alignment in Text-to-Image Models and Evaluation Metrics",
author = "Nayak, Shravan and
Bhatia, Mehar and
Zhang, Xiaofeng and
Rieser, Verena and
Hendricks, Lisa Anne and
Steenkiste, Sjoerd Van and
Goyal, Yash and
Stanczak, Karolina and
Agrawal, Aishwarya",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1141/",
doi = "10.18653/v1/2025.findings-emnlp.1141",
pages = "20918--20953",
ISBN = "979-8-89176-335-7",
abstract = "The increasing ubiquity of text-to-image (T2I) models as tools for visual content generation raises concerns about their ability to accurately represent diverse cultural contexts - where missed cues can stereotype communities and undermine usability. In this work, we present the first study to systematically quantify the alignment of T2I models and evaluation metrics with respect to both explicit (stated) as well as implicit (unstated, implied by the prompt{'}s cultural context) cultural expectations. To this end, we introduce CulturalFrames, a novel benchmark designed for rigorous human evaluation of cultural representation in visual generations. Spanning 10 countries and 5 socio-cultural domains, CulturalFrames comprises 983 prompts, 3637 corresponding images generated by 4 state-of-the-art T2I models, and over 10k detailed human annotations. We find that across models and countries, cultural expectations are missed an average of 44{\%} of the time. Among these failures, explicit expectations are missed at a surprisingly high average rate of 68{\%}, while implicit expectation failures are also significant, averaging 49{\%}. Furthermore, we show that existing T2I evaluation metrics correlate poorly with human judgments of cultural alignment, irrespective of their internal reasoning. Collectively, our findings expose critical gaps, provide a concrete testbed, and outline actionable directions for developing culturally informed T2I models and metrics that improve global usability."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nayak-etal-2025-culturalframes">
<titleInfo>
<title>CulturalFrames: Assessing Cultural Expectation Alignment in Text-to-Image Models and Evaluation Metrics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shravan</namePart>
<namePart type="family">Nayak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehar</namePart>
<namePart type="family">Bhatia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaofeng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Verena</namePart>
<namePart type="family">Rieser</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lisa</namePart>
<namePart type="given">Anne</namePart>
<namePart type="family">Hendricks</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sjoerd</namePart>
<namePart type="given">Van</namePart>
<namePart type="family">Steenkiste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yash</namePart>
<namePart type="family">Goyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karolina</namePart>
<namePart type="family">Stanczak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aishwarya</namePart>
<namePart type="family">Agrawal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>The increasing ubiquity of text-to-image (T2I) models as tools for visual content generation raises concerns about their ability to accurately represent diverse cultural contexts - where missed cues can stereotype communities and undermine usability. In this work, we present the first study to systematically quantify the alignment of T2I models and evaluation metrics with respect to both explicit (stated) as well as implicit (unstated, implied by the prompt’s cultural context) cultural expectations. To this end, we introduce CulturalFrames, a novel benchmark designed for rigorous human evaluation of cultural representation in visual generations. Spanning 10 countries and 5 socio-cultural domains, CulturalFrames comprises 983 prompts, 3637 corresponding images generated by 4 state-of-the-art T2I models, and over 10k detailed human annotations. We find that across models and countries, cultural expectations are missed an average of 44% of the time. Among these failures, explicit expectations are missed at a surprisingly high average rate of 68%, while implicit expectation failures are also significant, averaging 49%. Furthermore, we show that existing T2I evaluation metrics correlate poorly with human judgments of cultural alignment, irrespective of their internal reasoning. Collectively, our findings expose critical gaps, provide a concrete testbed, and outline actionable directions for developing culturally informed T2I models and metrics that improve global usability.</abstract>
<identifier type="citekey">nayak-etal-2025-culturalframes</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.1141</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1141/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>20918</start>
<end>20953</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CulturalFrames: Assessing Cultural Expectation Alignment in Text-to-Image Models and Evaluation Metrics
%A Nayak, Shravan
%A Bhatia, Mehar
%A Zhang, Xiaofeng
%A Rieser, Verena
%A Hendricks, Lisa Anne
%A Steenkiste, Sjoerd Van
%A Goyal, Yash
%A Stanczak, Karolina
%A Agrawal, Aishwarya
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F nayak-etal-2025-culturalframes
%X The increasing ubiquity of text-to-image (T2I) models as tools for visual content generation raises concerns about their ability to accurately represent diverse cultural contexts - where missed cues can stereotype communities and undermine usability. In this work, we present the first study to systematically quantify the alignment of T2I models and evaluation metrics with respect to both explicit (stated) as well as implicit (unstated, implied by the prompt’s cultural context) cultural expectations. To this end, we introduce CulturalFrames, a novel benchmark designed for rigorous human evaluation of cultural representation in visual generations. Spanning 10 countries and 5 socio-cultural domains, CulturalFrames comprises 983 prompts, 3637 corresponding images generated by 4 state-of-the-art T2I models, and over 10k detailed human annotations. We find that across models and countries, cultural expectations are missed an average of 44% of the time. Among these failures, explicit expectations are missed at a surprisingly high average rate of 68%, while implicit expectation failures are also significant, averaging 49%. Furthermore, we show that existing T2I evaluation metrics correlate poorly with human judgments of cultural alignment, irrespective of their internal reasoning. Collectively, our findings expose critical gaps, provide a concrete testbed, and outline actionable directions for developing culturally informed T2I models and metrics that improve global usability.
%R 10.18653/v1/2025.findings-emnlp.1141
%U https://aclanthology.org/2025.findings-emnlp.1141/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.1141
%P 20918-20953
Markdown (Informal)
[CulturalFrames: Assessing Cultural Expectation Alignment in Text-to-Image Models and Evaluation Metrics](https://aclanthology.org/2025.findings-emnlp.1141/) (Nayak et al., Findings 2025)
ACL
- Shravan Nayak, Mehar Bhatia, Xiaofeng Zhang, Verena Rieser, Lisa Anne Hendricks, Sjoerd Van Steenkiste, Yash Goyal, Karolina Stanczak, and Aishwarya Agrawal. 2025. CulturalFrames: Assessing Cultural Expectation Alignment in Text-to-Image Models and Evaluation Metrics. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 20918–20953, Suzhou, China. Association for Computational Linguistics.