% ACL Anthology record: workshop paper in the TrustNLP 2026 proceedings.
% NOTE(review): the editor surname "Galystan" may be a misspelling of "Galstyan";
% verify against the proceedings front matter before changing it.
@inproceedings{wang-etal-2026-evaluating-cross,
  title     = {Evaluating Cross-Lingual Behavior and Consistency of Multimodal Large Language Models},
  author    = {Wang, Hao and
               Huang, Pinzhi and
               Kawahara, Daisuke},
  editor    = {Chang, Kai-Wei and
               Mehrabi, Ninareh and
               Krishna, Satyapriya and
               Das, Anubrata and
               Dhamala, Jwala and
               Cao, Yang Trista and
               Kumarage, Tharindu and
               Ramakrishna, Anil and
               Christodoulopoulos, Christos and
               Wan, Yixin and
               Galystan, Aram and
               Kumar, Anoop and
               Gupta, Rahul},
  booktitle = {Proceedings of the 6th Workshop on Trustworthy {NLP} ({TrustNLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.trustnlp-main.1/},
  pages     = {1--20},
  isbn      = {979-8-89176-418-7},
  abstract  = {The rapid evolution of multimodal large language models (MLLMs) has significantly enhanced their real-world applications. However, achieving consistent performance across languages, especially when integrating cultural knowledge, remains a significant challenge. To better assess this issue, we introduce two new benchmarks: KnowRecall and VisRecall, which evaluate cross-lingual consistency in MLLMs. KnowRecall is a visual question answering benchmark designed to measure factual knowledge consistency in 15 languages, focusing on cultural and historical questions about global landmarks. VisRecall assesses visual memory consistency by asking models to describe landmark appearances in 9 languages without access to images. Experimental results reveal that state-of-the-art MLLMs, including proprietary ones, still struggle to achieve cross-lingual consistency. This underscores the need for more robust approaches that produce truly multilingual and culturally aware models.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for the paper (citekey wang-etal-2026-evaluating-cross); the host relatedItem describes the proceedings volume. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wang-etal-2026-evaluating-cross">
    <titleInfo>
      <title>Evaluating Cross-Lingual Behavior and Consistency of Multimodal Large Language Models</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Hao</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pinzhi</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daisuke</namePart>
      <namePart type="family">Kawahara</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kai-Wei</namePart>
        <namePart type="family">Chang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ninareh</namePart>
        <namePart type="family">Mehrabi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Satyapriya</namePart>
        <namePart type="family">Krishna</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anubrata</namePart>
        <namePart type="family">Das</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jwala</namePart>
        <namePart type="family">Dhamala</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="given">Trista</namePart>
        <namePart type="family">Cao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tharindu</namePart>
        <namePart type="family">Kumarage</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anil</namePart>
        <namePart type="family">Ramakrishna</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yixin</namePart>
        <namePart type="family">Wan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <!-- NOTE(review): surname "Galystan" may be a misspelling of "Galstyan"; verify against the proceedings front matter. -->
      <name type="personal">
        <namePart type="given">Aram</namePart>
        <namePart type="family">Galystan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anoop</namePart>
        <namePart type="family">Kumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rahul</namePart>
        <namePart type="family">Gupta</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-418-7</identifier>
    </relatedItem>
    <abstract>The rapid evolution of multimodal large language models (MLLMs) has significantly enhanced their real-world applications. However, achieving consistent performance across languages, especially when integrating cultural knowledge, remains a significant challenge. To better assess this issue, we introduce two new benchmarks: KnowRecall and VisRecall, which evaluate cross-lingual consistency in MLLMs. KnowRecall is a visual question answering benchmark designed to measure factual knowledge consistency in 15 languages, focusing on cultural and historical questions about global landmarks. VisRecall assesses visual memory consistency by asking models to describe landmark appearances in 9 languages without access to images. Experimental results reveal that state-of-the-art MLLMs, including proprietary ones, still struggle to achieve cross-lingual consistency. This underscores the need for more robust approaches that produce truly multilingual and culturally aware models.</abstract>
    <identifier type="citekey">wang-etal-2026-evaluating-cross</identifier>
    <location>
      <url>https://aclanthology.org/2026.trustnlp-main.1/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>1</start>
        <end>20</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Cross-Lingual Behavior and Consistency of Multimodal Large Language Models
%A Wang, Hao
%A Huang, Pinzhi
%A Kawahara, Daisuke
%Y Chang, Kai-Wei
%Y Mehrabi, Ninareh
%Y Krishna, Satyapriya
%Y Das, Anubrata
%Y Dhamala, Jwala
%Y Cao, Yang Trista
%Y Kumarage, Tharindu
%Y Ramakrishna, Anil
%Y Christodoulopoulos, Christos
%Y Wan, Yixin
%Y Galystan, Aram
%Y Kumar, Anoop
%Y Gupta, Rahul
%S Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-418-7
%F wang-etal-2026-evaluating-cross
%X The rapid evolution of multimodal large language models (MLLMs) has significantly enhanced their real-world applications. However, achieving consistent performance across languages, especially when integrating cultural knowledge, remains a significant challenge. To better assess this issue, we introduce two new benchmarks: KnowRecall and VisRecall, which evaluate cross-lingual consistency in MLLMs. KnowRecall is a visual question answering benchmark designed to measure factual knowledge consistency in 15 languages, focusing on cultural and historical questions about global landmarks. VisRecall assesses visual memory consistency by asking models to describe landmark appearances in 9 languages without access to images. Experimental results reveal that state-of-the-art MLLMs, including proprietary ones, still struggle to achieve cross-lingual consistency. This underscores the need for more robust approaches that produce truly multilingual and culturally aware models.
%U https://aclanthology.org/2026.trustnlp-main.1/
%P 1-20
Markdown (Informal)
[Evaluating Cross-Lingual Behavior and Consistency of Multimodal Large Language Models](https://aclanthology.org/2026.trustnlp-main.1/) (Wang et al., TrustNLP 2026)
ACL