% BibTeX record for the paper published at https://aclanthology.org/2026.findings-acl.1362/
% (text outside an @entry is ignored by BibTeX, so these lines act as comments).
% NOTE(review): `address` appears to carry the conference location rather than the
% publisher's city -- confirm this matches what the target bibliography style expects.
@inproceedings{kim-etal-2026-visual-interference,
  title     = {Visual Interference in Speech Evaluation: Cultural Asymmetry and Cross-Modal Bias in {MLLMs}},
  author    = {Kim, Kyusik and
               Yoo, Hyunwoo and
               Choi, Jaehoon and
               Rosen, Gail and
               Suh, Bongwon},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1362/},
  pages     = {27332--27358},
  isbn      = {979-8-89176-395-1},
  abstract  = {The transition to end-to-end Multimodal Large Language Models (MLLMs) has positioned these architectures as active social evaluators in high-stakes domains. However, it remains unclear whether these models maintain objective auditory perception or succumb to the ``Hearing with Eyes'' phenomenon, where visual racial cues distort linguistic proficiency evaluations. We investigate this cross-modal bias by constructing a controlled counterfactual dataset utilizing a Visual Matched-Guise Paradigm. By pairing identical native audio with diverse visual personas across English and Korean contexts, we reveal a distinct Cultural Asymmetry in model behavior. In Anglophone settings, most closed models exhibit Reverse Linguistic Stereotyping, hallucinating non-native accents for Asian speakers despite standard native audio. Conversely, in Korean settings, the same models assign baseline-relative competence premiums across all visual personas, with the largest gains for out-group (White/Black) speakers, consistent with Expectancy Violation Theory. Our findings demonstrate that MLLMs do not merely process sensory inputs but actively reproduce context-dependent sociolinguistic ideologies.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-etal-2026-visual-interference">
<titleInfo>
<title>Visual Interference in Speech Evaluation: Cultural Asymmetry and Cross-Modal Bias in MLLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kyusik</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyunwoo</namePart>
<namePart type="family">Yoo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaehoon</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gail</namePart>
<namePart type="family">Rosen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bongwon</namePart>
<namePart type="family">Suh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P.</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>The transition to end-to-end Multimodal Large Language Models (MLLMs) has positioned these architectures as active social evaluators in high-stakes domains. However, it remains unclear whether these models maintain objective auditory perception or succumb to the “Hearing with Eyes” phenomenon, where visual racial cues distort linguistic proficiency evaluations. We investigate this cross-modal bias by constructing a controlled counterfactual dataset utilizing a Visual Matched-Guise Paradigm. By pairing identical native audio with diverse visual personas across English and Korean contexts, we reveal a distinct Cultural Asymmetry in model behavior. In Anglophone settings, most closed models exhibit Reverse Linguistic Stereotyping, hallucinating non-native accents for Asian speakers despite standard native audio. Conversely, in Korean settings, the same models assign baseline-relative competence premiums across all visual personas, with the largest gains for out-group (White/Black) speakers, consistent with Expectancy Violation Theory. Our findings demonstrate that MLLMs do not merely process sensory inputs but actively reproduce context-dependent sociolinguistic ideologies.</abstract>
<identifier type="citekey">kim-etal-2026-visual-interference</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1362/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>27332</start>
<end>27358</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Visual Interference in Speech Evaluation: Cultural Asymmetry and Cross-Modal Bias in MLLMs
%A Kim, Kyusik
%A Yoo, Hyunwoo
%A Choi, Jaehoon
%A Rosen, Gail
%A Suh, Bongwon
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F kim-etal-2026-visual-interference
%X The transition to end-to-end Multimodal Large Language Models (MLLMs) has positioned these architectures as active social evaluators in high-stakes domains. However, it remains unclear whether these models maintain objective auditory perception or succumb to the “Hearing with Eyes” phenomenon, where visual racial cues distort linguistic proficiency evaluations. We investigate this cross-modal bias by constructing a controlled counterfactual dataset utilizing a Visual Matched-Guise Paradigm. By pairing identical native audio with diverse visual personas across English and Korean contexts, we reveal a distinct Cultural Asymmetry in model behavior. In Anglophone settings, most closed models exhibit Reverse Linguistic Stereotyping, hallucinating non-native accents for Asian speakers despite standard native audio. Conversely, in Korean settings, the same models assign baseline-relative competence premiums across all visual personas, with the largest gains for out-group (White/Black) speakers, consistent with Expectancy Violation Theory. Our findings demonstrate that MLLMs do not merely process sensory inputs but actively reproduce context-dependent sociolinguistic ideologies.
%U https://aclanthology.org/2026.findings-acl.1362/
%P 27332-27358
Markdown (Informal)
[Visual Interference in Speech Evaluation: Cultural Asymmetry and Cross-Modal Bias in MLLMs](https://aclanthology.org/2026.findings-acl.1362/) (Kim et al., Findings 2026)
ACL