% Workshop paper record; the editor list names the editors of the proceedings volume given in booktitle.
@inproceedings{bharati-etal-2026-mirage,
  title     = {The Mirage of Diversity: Unmasking the Cultural Vocabulary Ceiling in {LLM}s},
  author    = {Bharati, Soumedhik and
               Mukherjee, Subhrajit and
               Mandal, Shibam},
  editor    = {Prabhakaran, Vinodkumar and
               Dev, Sunipa and
               Benotti, Luciana and
               Hershcovich, Daniel and
               Cao, Yong and
               Zhou, Li and
               Ma, Bolei and
               Adebara, Ife},
  booktitle = {Proceedings of the 4th Workshop on Cross-Cultural Considerations in {NLP} ({C}3{NLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.c3nlp-1.7/},
  pages     = {101--107},
  isbn      = {979-8-89176-420-0},
  abstract  = {Large Language Models are widely used to generate and adapt cultural texts, yet the depth of their cultural representation remains poorly quantified. Intuitively, as a narrative text expands in length, the diversity of cultural words should scale proportionately. To formally test this, we evaluate the FairyTaleQA dataset, adapted by three models and introduce our primary contribution: the Contextual Stereotype Amplification Index (CSAI), an evaluation framework combining LLM-as-a-judge extraction, embedding-based clich{\'e} anchoring, and Natural Language Inference (NLI) congruence validation. By mapping the frequency of extracted Culture Specific Items (CSIs) against narrative length using Heaps' Law ($V = k \cdot T^\beta$), we present empirical evidence of a systematic limitation in current systems: they struggle to scale cultural diversity even under explicit cultural prompting. Models rapidly hit a ``Cultural Vocabulary Ceiling,'' constrained to a fixed set of hyper-stereotypical terms. Furthermore, we demonstrate that merely optimizing for higher CSI frequency as done in prior works rewards logically broken tokenism. Our CSAI formulation actively penalizes such gratuitous stereotyping, offering a more principled approach to measuring and evaluating cultural homogenization in generative AI systems.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- One MODS record for the paper. The nested relatedItem type="host"
       describes the proceedings volume: its title, editors, publisher,
       place and ISBN. -->
  <mods ID="bharati-etal-2026-mirage">
    <titleInfo>
      <title>The Mirage of Diversity: Unmasking the Cultural Vocabulary Ceiling in LLMs</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Soumedhik</namePart>
      <namePart type="family">Bharati</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Subhrajit</namePart>
      <namePart type="family">Mukherjee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shibam</namePart>
      <namePart type="family">Mandal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th Workshop on Cross-Cultural Considerations in NLP (C3NLP 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Vinodkumar</namePart>
        <namePart type="family">Prabhakaran</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sunipa</namePart>
        <namePart type="family">Dev</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Luciana</namePart>
        <namePart type="family">Benotti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Hershcovich</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yong</namePart>
        <namePart type="family">Cao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Li</namePart>
        <namePart type="family">Zhou</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bolei</namePart>
        <namePart type="family">Ma</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ife</namePart>
        <namePart type="family">Adebara</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-420-0</identifier>
    </relatedItem>
    <abstract>Large Language Models are widely used to generate and adapt cultural texts, yet the depth of their cultural representation remains poorly quantified. Intuitively, as a narrative text expands in length, the diversity of cultural words should scale proportionately. To formally test this, we evaluate the FairyTaleQA dataset, adapted by three models and introduce our primary contribution: the Contextual Stereotype Amplification Index (CSAI), an evaluation framework combining LLM-as-a-judge extraction, embedding-based cliché anchoring, and Natural Language Inference (NLI) congruence validation. By mapping the frequency of extracted Culture Specific Items (CSIs) against narrative length using Heaps’ Law (V = k · T^β), we present empirical evidence of a systematic limitation in current systems: they struggle to scale cultural diversity even under explicit cultural prompting. Models rapidly hit a “Cultural Vocabulary Ceiling,” constrained to a fixed set of hyper-stereotypical terms. Furthermore, we demonstrate that merely optimizing for higher CSI frequency as done in prior works rewards logically broken tokenism. Our CSAI formulation actively penalizes such gratuitous stereotyping, offering a more principled approach to measuring and evaluating cultural homogenization in generative AI systems.</abstract>
    <identifier type="citekey">bharati-etal-2026-mirage</identifier>
    <location>
      <url>https://aclanthology.org/2026.c3nlp-1.7/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>101</start>
        <end>107</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Mirage of Diversity: Unmasking the Cultural Vocabulary Ceiling in LLMs
%A Bharati, Soumedhik
%A Mukherjee, Subhrajit
%A Mandal, Shibam
%Y Prabhakaran, Vinodkumar
%Y Dev, Sunipa
%Y Benotti, Luciana
%Y Hershcovich, Daniel
%Y Cao, Yong
%Y Zhou, Li
%Y Ma, Bolei
%Y Adebara, Ife
%S Proceedings of the 4th Workshop on Cross-Cultural Considerations in NLP (C3NLP 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-420-0
%F bharati-etal-2026-mirage
%X Large Language Models are widely used to generate and adapt cultural texts, yet the depth of their cultural representation remains poorly quantified. Intuitively, as a narrative text expands in length, the diversity of cultural words should scale proportionately. To formally test this, we evaluate the FairyTaleQA dataset, adapted by three models and introduce our primary contribution: the Contextual Stereotype Amplification Index (CSAI), an evaluation framework combining LLM-as-a-judge extraction, embedding-based cliché anchoring, and Natural Language Inference (NLI) congruence validation. By mapping the frequency of extracted Culture Specific Items (CSIs) against narrative length using Heaps’ Law (V = k · T^β), we present empirical evidence of a systematic limitation in current systems: they struggle to scale cultural diversity even under explicit cultural prompting. Models rapidly hit a “Cultural Vocabulary Ceiling,” constrained to a fixed set of hyper-stereotypical terms. Furthermore, we demonstrate that merely optimizing for higher CSI frequency as done in prior works rewards logically broken tokenism. Our CSAI formulation actively penalizes such gratuitous stereotyping, offering a more principled approach to measuring and evaluating cultural homogenization in generative AI systems.
%U https://aclanthology.org/2026.c3nlp-1.7/
%P 101-107
Markdown (Informal)
[The Mirage of Diversity: Unmasking the Cultural Vocabulary Ceiling in LLMs](https://aclanthology.org/2026.c3nlp-1.7/) (Bharati et al., C3NLP 2026)
ACL