% ACL Anthology record 2026.mellm-1.25 (MeLLM 2026 workshop paper).
% Case-sensitive words in title/booktitle are protected with whole-word braces
% so that sentence-casing styles keep them intact.
@inproceedings{raihan-chowdhury-2026-causal,
    title     = {Causal Localization of the {English} Pivot in {LLaVA}: Mechanistic {VLM} Analysis and Training-Free Multilingual Steering},
    author    = {Raihan, Abrar Zahin and
                 Chowdhury, Aurchi},
    editor    = {Huang, Kaiyu and
                 Mo, Fengran and
                 Chen, Pinzhen and
                 Jiang, Meng},
    booktitle = {Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models ({MeLLM} 2026)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.mellm-1.25/},
    pages     = {257--265},
    isbn      = {979-8-89176-430-9},
    abstract  = {Multilingual vision-language models (VLMs) consistently underperform on non-English visual queries, yet the internal mechanism behind this disparity remains unknown. As a focused case study on LLaVA-1.5-7B, we apply logit-lens analysis and causal activation patching to show that non-English visual queries are routed through an English-biased representational bottleneck in layers 5{--}17, extending the English-pivot phenomenon of Wendler et al. (2024) to the multimodal setting. Peak causal influence occurs at layer 8 ($\overline{\text{AIE}} = 0.49$, averaged across languages), with all measurable pivot signal running through text-token positions. Without meaningful visual content (blank-image condition), language-specific representations do not emerge at any layer, showing that the pivot is image-content-dependent rather than triggered by any visual input. Building on these findings, we derive training-free language-steering vectors at the mechanistically identified pivot layers, improving Russian VQA by +6.5 pp and Portuguese by +4.0 pp on MMMB without any fine-tuning {---} the latter surpassing the English baseline. Within this case study, our results are consistent with the English pivot being a structural property of the LLM backbone that multimodal pre-training does not mitigate; extending this mechanistic methodology to other VLMs and language families remains an important direction for future work.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="raihan-chowdhury-2026-causal">
<titleInfo>
<title>Causal Localization of the English Pivot in LLaVA: Mechanistic VLM Analysis and Training-Free Multilingual Steering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abrar</namePart>
<namePart type="given">Zahin</namePart>
<namePart type="family">Raihan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aurchi</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kaiyu</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fengran</namePart>
<namePart type="family">Mo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meng</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-430-9</identifier>
</relatedItem>
<abstract>Multilingual vision-language models (VLMs) consistently underperform on non-English visual queries, yet the internal mechanism behind this disparity remains unknown. As a focused case study on LLaVA-1.5-7B, we apply logit-lens analysis and causal activation patching to show that non-English visual queries are routed through an English-biased representational bottleneck in layers 5–17, extending the English-pivot phenomenon of Wendler et al. (2024) to the multimodal setting. Peak causal influence occurs at layer 8 ($\overline{\text{AIE}} = 0.49$, averaged across languages), with all measurable pivot signal running through text-token positions. Without meaningful visual content (blank-image condition), language-specific representations do not emerge at any layer, showing that the pivot is image-content-dependent rather than triggered by any visual input. Building on these findings, we derive training-free language-steering vectors at the mechanistically identified pivot layers, improving Russian VQA by +6.5 pp and Portuguese by +4.0 pp on MMMB without any fine-tuning — the latter surpassing the English baseline. Within this case study, our results are consistent with the English pivot being a structural property of the LLM backbone that multimodal pre-training does not mitigate; extending this mechanistic methodology to other VLMs and language families remains an important direction for future work.</abstract>
<identifier type="citekey">raihan-chowdhury-2026-causal</identifier>
<location>
<url>https://aclanthology.org/2026.mellm-1.25/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>257</start>
<end>265</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Causal Localization of the English Pivot in LLaVA: Mechanistic VLM Analysis and Training-Free Multilingual Steering
%A Raihan, Abrar Zahin
%A Chowdhury, Aurchi
%Y Huang, Kaiyu
%Y Mo, Fengran
%Y Chen, Pinzhen
%Y Jiang, Meng
%S Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models (MeLLM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-430-9
%F raihan-chowdhury-2026-causal
%X Multilingual vision-language models (VLMs) consistently underperform on non-English visual queries, yet the internal mechanism behind this disparity remains unknown. As a focused case study on LLaVA-1.5-7B, we apply logit-lens analysis and causal activation patching to show that non-English visual queries are routed through an English-biased representational bottleneck in layers 5–17, extending the English-pivot phenomenon of Wendler et al. (2024) to the multimodal setting. Peak causal influence occurs at layer 8 ($\overline{\text{AIE}} = 0.49$, averaged across languages), with all measurable pivot signal running through text-token positions. Without meaningful visual content (blank-image condition), language-specific representations do not emerge at any layer, showing that the pivot is image-content-dependent rather than triggered by any visual input. Building on these findings, we derive training-free language-steering vectors at the mechanistically identified pivot layers, improving Russian VQA by +6.5 pp and Portuguese by +4.0 pp on MMMB without any fine-tuning — the latter surpassing the English baseline. Within this case study, our results are consistent with the English pivot being a structural property of the LLM backbone that multimodal pre-training does not mitigate; extending this mechanistic methodology to other VLMs and language families remains an important direction for future work.
%U https://aclanthology.org/2026.mellm-1.25/
%P 257-265
Markdown (Informal)
[Causal Localization of the English Pivot in LLaVA: Mechanistic VLM Analysis and Training-Free Multilingual Steering](https://aclanthology.org/2026.mellm-1.25/) (Raihan & Chowdhury, MeLLM 2026)
ACL