% ACL Anthology record: paper in "Findings of the Association for
% Computational Linguistics: ACL 2026" (anthology id 2026.findings-acl.1795).
% The abstract is stored as plain text; Markdown emphasis markers that leaked
% in from the submission form were removed because LaTeX would print them as
% literal asterisks.
@inproceedings{saengsukhiran-etal-2026-evaluating,
    title     = {Evaluating Perspectival Biases in Cross-Modal Retrieval},
    author    = {Saengsukhiran, Teerapol and
                 Chomphooyod, Peerawat and
                 Rodjananant, Narabodee and
                 Chaksangchaichot, Chompakorn and
                 Prakrankamanant, Patawee and
                 Sripheanpol, Witthawin and
                 Lovichit, Pak and
                 Nutanong, Sarana and
                 Chuangsuwanich, Ekapol},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.findings-acl.1795/},
    pages     = {36018--36049},
    isbn      = {979-8-89176-395-1},
    abstract  = {Multimodal retrieval systems are expected to operate in a semantic space, agnostic to the language or cultural origin of the query. In practice, however, retrieval outcomes systematically reflect perspectival biases: deviations shaped by linguistic prevalence and cultural associations. We introduce the Cross-Cultural, Cross-Modal, Cross-lingual Multimodal (3XCM) benchmark to isolate these effects. Results from our studies indicate that, for image-to-text retrieval, models tend to favor entries from prevalent languages over those that are semantically faithful. For text-to-image retrieval, we observe a consistent ``tugging effect'' in the joint embedding space between semantic alignment and language-conditioned cultural association. When semantic representations are insufficiently resolved, particularly in low-resource languages, similarity is increasingly governed by culturally familiar visual patterns, leading to systematic association bias in retrieval. Our findings suggest that achieving equitable multimodal retrieval necessitates targeted strategies that explicitly decouple language from culture, rather than relying solely on broader data exposure. This work highlights the need to treat linguistic and cultural biases as distinct, measurable challenges in multimodal representation learning.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!--
  MODS v3 record for citekey saengsukhiran-etal-2026-evaluating.
  Layout: article title, nine personal-name authors, issue date, then the
  host item (proceedings title, four editors, publisher, place, ISBN),
  followed by the abstract, citekey, URL and page extent.
  The abstract is plain text; Markdown emphasis markers were removed.
-->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="saengsukhiran-etal-2026-evaluating">
    <titleInfo>
      <title>Evaluating Perspectival Biases in Cross-Modal Retrieval</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Teerapol</namePart>
      <namePart type="family">Saengsukhiran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Peerawat</namePart>
      <namePart type="family">Chomphooyod</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Narabodee</namePart>
      <namePart type="family">Rodjananant</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chompakorn</namePart>
      <namePart type="family">Chaksangchaichot</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Patawee</namePart>
      <namePart type="family">Prakrankamanant</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Witthawin</namePart>
      <namePart type="family">Sripheanpol</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pak</namePart>
      <namePart type="family">Lovichit</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sarana</namePart>
      <namePart type="family">Nutanong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ekapol</namePart>
      <namePart type="family">Chuangsuwanich</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <!-- Middle initial keeps its period, matching the other citation formats of this record -->
        <namePart type="given">P.</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Multimodal retrieval systems are expected to operate in a semantic space, agnostic to the language or cultural origin of the query. In practice, however, retrieval outcomes systematically reflect perspectival biases: deviations shaped by linguistic prevalence and cultural associations. We introduce the Cross-Cultural, Cross-Modal, Cross-lingual Multimodal (3XCM) benchmark to isolate these effects. Results from our studies indicate that, for image-to-text retrieval, models tend to favor entries from prevalent languages over those that are semantically faithful. For text-to-image retrieval, we observe a consistent “tugging effect” in the joint embedding space between semantic alignment and language-conditioned cultural association. When semantic representations are insufficiently resolved, particularly in low-resource languages, similarity is increasingly governed by culturally familiar visual patterns, leading to systematic association bias in retrieval. Our findings suggest that achieving equitable multimodal retrieval necessitates targeted strategies that explicitly decouple language from culture, rather than relying solely on broader data exposure. This work highlights the need to treat linguistic and cultural biases as distinct, measurable challenges in multimodal representation learning.</abstract>
    <identifier type="citekey">saengsukhiran-etal-2026-evaluating</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1795/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>36018</start>
        <end>36049</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Perspectival Biases in Cross-Modal Retrieval
%A Saengsukhiran, Teerapol
%A Chomphooyod, Peerawat
%A Rodjananant, Narabodee
%A Chaksangchaichot, Chompakorn
%A Prakrankamanant, Patawee
%A Sripheanpol, Witthawin
%A Lovichit, Pak
%A Nutanong, Sarana
%A Chuangsuwanich, Ekapol
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F saengsukhiran-etal-2026-evaluating
%X Multimodal retrieval systems are expected to operate in a semantic space, agnostic to the language or cultural origin of the query. In practice, however, retrieval outcomes systematically reflect perspectival biases: deviations shaped by linguistic prevalence and cultural associations. We introduce the Cross-Cultural, Cross-Modal, Cross-lingual Multimodal (3XCM) benchmark to isolate these effects. Results from our studies indicate that, for image-to-text retrieval, models tend to favor entries from prevalent languages over those that are semantically faithful. For text-to-image retrieval, we observe a consistent “tugging effect” in the joint embedding space between semantic alignment and language-conditioned cultural association. When semantic representations are insufficiently resolved, particularly in low-resource languages, similarity is increasingly governed by culturally familiar visual patterns, leading to systematic association bias in retrieval. Our findings suggest that achieving equitable multimodal retrieval necessitates targeted strategies that explicitly decouple language from culture, rather than relying solely on broader data exposure. This work highlights the need to treat linguistic and cultural biases as distinct, measurable challenges in multimodal representation learning.
%U https://aclanthology.org/2026.findings-acl.1795/
%P 36018-36049
Markdown (Informal)
[Evaluating Perspectival Biases in Cross-Modal Retrieval](https://aclanthology.org/2026.findings-acl.1795/) (Saengsukhiran et al., Findings 2026)
ACL
- Teerapol Saengsukhiran, Peerawat Chomphooyod, Narabodee Rodjananant, Chompakorn Chaksangchaichot, Patawee Prakrankamanant, Witthawin Sripheanpol, Pak Lovichit, Sarana Nutanong, and Ekapol Chuangsuwanich. 2026. Evaluating Perspectival Biases in Cross-Modal Retrieval. In Findings of the Association for Computational Linguistics: ACL 2026, pages 36018–36049, San Diego, California, United States. Association for Computational Linguistics.