@inproceedings{kim-etal-2025-tom,
title = "{WHEN} {TOM} {EATS} {KIMCHI}: Evaluating Cultural Awareness of Multimodal Large Language Models in Cultural Mixture Contexts",
author = "Kim, Jun Seong and
Thu, Kyaw Ye and
Ismayilzada, Javad and
Park, Junyeong and
Kim, Eunsu and
Ahmad, Huzama and
An, Na Min and
Thorne, James and
Oh, Alice",
editor = "Prabhakaran, Vinodkumar and
Dev, Sunipa and
Benotti, Luciana and
Hershcovich, Daniel and
Cao, Yong and
Zhou, Li and
Cabello, Laura and
Adebara, Ife",
booktitle = "Proceedings of the 3rd Workshop on Cross-Cultural Considerations in NLP (C3NLP 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.c3nlp-1.11/",
doi = "10.18653/v1/2025.c3nlp-1.11",
pages = "143--154",
ISBN = "979-8-89176-237-4",
abstract = "In a highly globalized world, it is important for multi-modal large language models (MLLMs) to recognize and respond correctly to mixed-cultural inputs.For example, a model should correctly identify kimchi (Korean food) in an image both when an Asian woman is eating it, as well as an African man is eating it.However, current MLLMs show an over-reliance on the visual features of the person, leading to misclassification of the entities. To examine the robustness of MLLMs to different ethnicity, we introduce MIXCUBE, a cross-cultural bias benchmark, and study elements from five countries and four ethnicities. Our findings reveal that MLLMs achieve both higher accuracy and lower sensitivity to such perturbation for high-resource cultures, but not for low-resource cultures. GPT-4o, the best-performing model overall, shows up to 58{\%} difference in accuracy between the original and perturbed cultural settings in low-resource cultures"
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-etal-2025-tom">
<titleInfo>
<title>WHEN TOM EATS KIMCHI: Evaluating Cultural Awareness of Multimodal Large Language Models in Cultural Mixture Contexts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="given">Seong</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyaw</namePart>
<namePart type="given">Ye</namePart>
<namePart type="family">Thu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Javad</namePart>
<namePart type="family">Ismayilzada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junyeong</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eunsu</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huzama</namePart>
<namePart type="family">Ahmad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Na</namePart>
<namePart type="given">Min</namePart>
<namePart type="family">An</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Thorne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alice</namePart>
<namePart type="family">Oh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on Cross-Cultural Considerations in NLP (C3NLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vinodkumar</namePart>
<namePart type="family">Prabhakaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sunipa</namePart>
<namePart type="family">Dev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luciana</namePart>
<namePart type="family">Benotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Hershcovich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yong</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Cabello</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ife</namePart>
<namePart type="family">Adebara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-237-4</identifier>
</relatedItem>
<abstract>In a highly globalized world, it is important for multi-modal large language models (MLLMs) to recognize and respond correctly to mixed-cultural inputs.For example, a model should correctly identify kimchi (Korean food) in an image both when an Asian woman is eating it, as well as an African man is eating it.However, current MLLMs show an over-reliance on the visual features of the person, leading to misclassification of the entities. To examine the robustness of MLLMs to different ethnicity, we introduce MIXCUBE, a cross-cultural bias benchmark, and study elements from five countries and four ethnicities. Our findings reveal that MLLMs achieve both higher accuracy and lower sensitivity to such perturbation for high-resource cultures, but not for low-resource cultures. GPT-4o, the best-performing model overall, shows up to 58% difference in accuracy between the original and perturbed cultural settings in low-resource cultures</abstract>
<identifier type="citekey">kim-etal-2025-tom</identifier>
<identifier type="doi">10.18653/v1/2025.c3nlp-1.11</identifier>
<location>
<url>https://aclanthology.org/2025.c3nlp-1.11/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>143</start>
<end>154</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T WHEN TOM EATS KIMCHI: Evaluating Cultural Awareness of Multimodal Large Language Models in Cultural Mixture Contexts
%A Kim, Jun Seong
%A Thu, Kyaw Ye
%A Ismayilzada, Javad
%A Park, Junyeong
%A Kim, Eunsu
%A Ahmad, Huzama
%A An, Na Min
%A Thorne, James
%A Oh, Alice
%Y Prabhakaran, Vinodkumar
%Y Dev, Sunipa
%Y Benotti, Luciana
%Y Hershcovich, Daniel
%Y Cao, Yong
%Y Zhou, Li
%Y Cabello, Laura
%Y Adebara, Ife
%S Proceedings of the 3rd Workshop on Cross-Cultural Considerations in NLP (C3NLP 2025)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-237-4
%F kim-etal-2025-tom
%X In a highly globalized world, it is important for multi-modal large language models (MLLMs) to recognize and respond correctly to mixed-cultural inputs.For example, a model should correctly identify kimchi (Korean food) in an image both when an Asian woman is eating it, as well as an African man is eating it.However, current MLLMs show an over-reliance on the visual features of the person, leading to misclassification of the entities. To examine the robustness of MLLMs to different ethnicity, we introduce MIXCUBE, a cross-cultural bias benchmark, and study elements from five countries and four ethnicities. Our findings reveal that MLLMs achieve both higher accuracy and lower sensitivity to such perturbation for high-resource cultures, but not for low-resource cultures. GPT-4o, the best-performing model overall, shows up to 58% difference in accuracy between the original and perturbed cultural settings in low-resource cultures
%R 10.18653/v1/2025.c3nlp-1.11
%U https://aclanthology.org/2025.c3nlp-1.11/
%U https://doi.org/10.18653/v1/2025.c3nlp-1.11
%P 143-154
Markdown (Informal)
[WHEN TOM EATS KIMCHI: Evaluating Cultural Awareness of Multimodal Large Language Models in Cultural Mixture Contexts](https://aclanthology.org/2025.c3nlp-1.11/) (Kim et al., C3NLP 2025)
ACL
- Jun Seong Kim, Kyaw Ye Thu, Javad Ismayilzada, Junyeong Park, Eunsu Kim, Huzama Ahmad, Na Min An, James Thorne, and Alice Oh. 2025. WHEN TOM EATS KIMCHI: Evaluating Cultural Awareness of Multimodal Large Language Models in Cultural Mixture Contexts. In Proceedings of the 3rd Workshop on Cross-Cultural Considerations in NLP (C3NLP 2025), pages 143–154, Albuquerque, New Mexico. Association for Computational Linguistics.