% Conference-paper record for the Findings of ACL 2026 article by Yan et al.
% The abstract is stored as plain text; emphasis is left to the rendering style.
% NOTE(review): the address field is kept exactly as exported; it looks like the
% conference venue rather than the publisher's city -- confirm before relying on it.
@inproceedings{yan-etal-2026-beyond,
  title     = {Beyond Cross-Modal Alignment: Measuring and Leveraging Modality Gap in Vision-Language Models},
  author    = {Yan, Hanqi and
               Cui, Xiangxiang and
               Yin, Lu and
               Gu, Jindong and
               Liang, Paul Pu and
               He, Yulan and
               Wang, Yifei},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.588/},
  pages     = {12123--12138},
  isbn      = {979-8-89176-395-1},
  abstract  = {The success of vision-language models is primarily attributed to effective cross-modal alignment between vision and language. However, modality gaps persist even in well-aligned models and may be necessary for human perception, as evidenced by modality-specific phenomena such as visual texture and linguistic tone. These observations motivate us to computationally measure and leverage modality gaps to explore their utility in downstream applications. In this paper, we introduce the Modality Dominance Score (MDS), which attributes multimodal features to specific modalities by categorizing them as vision-dominant, language-dominant, or cross-modal. We then propose automatic interpretability metrics to evaluate these modality-specific features in a scalable manner. Finally, we demonstrate how the identified modality-specific features enable training-free probing and editing methods for understanding model perception across genders, generating adversarial examples, and controlling text-to-image generation. Combined with task-agnostic interpretability tools, our work provides a systematic framework for analyzing and efficiently controlling multimodal models.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey yan-etal-2026-beyond: the paper's own metadata
     first, then the host proceedings volume inside relatedItem type="host". -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="yan-etal-2026-beyond">
    <titleInfo>
      <title>Beyond Cross-Modal Alignment: Measuring and Leveraging Modality Gap in Vision-Language Models</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Hanqi</namePart>
      <namePart type="family">Yan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiangxiang</namePart>
      <namePart type="family">Cui</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lu</namePart>
      <namePart type="family">Yin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jindong</namePart>
      <namePart type="family">Gu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Paul</namePart>
      <namePart type="given">Pu</namePart>
      <namePart type="family">Liang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yulan</namePart>
      <namePart type="family">He</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yifei</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <!-- Middle initial keeps its period, matching the other citation formats of this record. -->
        <namePart type="given">P.</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>The success of vision-language models is primarily attributed to effective cross-modal alignment between vision and language. However, modality gaps persist even in well-aligned models and may be necessary for human perception, as evidenced by modality-specific phenomena such as visual texture and linguistic tone. These observations motivate us to computationally measure and leverage modality gaps to explore their utility in downstream applications. In this paper, we introduce the Modality Dominance Score (MDS), which attributes multimodal features to specific modalities by categorizing them as vision-dominant, language-dominant, or cross-modal. We then propose automatic interpretability metrics to evaluate these modality-specific features in a scalable manner. Finally, we demonstrate how the identified modality-specific features enable training-free probing and editing methods for understanding model perception across genders, generating adversarial examples, and controlling text-to-image generation. Combined with task-agnostic interpretability tools, our work provides a systematic framework for analyzing and efficiently controlling multimodal models.</abstract>
    <identifier type="citekey">yan-etal-2026-beyond</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.588/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>12123</start>
        <end>12138</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Beyond Cross-Modal Alignment: Measuring and Leveraging Modality Gap in Vision-Language Models
%A Yan, Hanqi
%A Cui, Xiangxiang
%A Yin, Lu
%A Gu, Jindong
%A Liang, Paul Pu
%A He, Yulan
%A Wang, Yifei
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F yan-etal-2026-beyond
%X The success of vision-language models is primarily attributed to effective cross-modal alignment between vision and language. However, modality gaps persist even in well-aligned models and may be necessary for human perception, as evidenced by modality-specific phenomena such as visual texture and linguistic tone. These observations motivate us to computationally measure and leverage modality gaps to explore their utility in downstream applications. In this paper, we introduce the Modality Dominance Score (MDS), which attributes multimodal features to specific modalities by categorizing them as vision-dominant, language-dominant, or cross-modal. We then propose automatic interpretability metrics to evaluate these modality-specific features in a scalable manner. Finally, we demonstrate how the identified modality-specific features enable training-free probing and editing methods for understanding model perception across genders, generating adversarial examples, and controlling text-to-image generation. Combined with task-agnostic interpretability tools, our work provides a systematic framework for analyzing and efficiently controlling multimodal models.
%U https://aclanthology.org/2026.findings-acl.588/
%P 12123-12138
Markdown (Informal)
[Beyond Cross-Modal Alignment: Measuring and Leveraging Modality Gap in Vision-Language Models](https://aclanthology.org/2026.findings-acl.588/) (Yan et al., Findings 2026)
ACL
- Hanqi Yan, Xiangxiang Cui, Lu Yin, Jindong Gu, Paul Pu Liang, Yulan He, and Yifei Wang. 2026. Beyond Cross-Modal Alignment: Measuring and Leveraging Modality Gap in Vision-Language Models. In Findings of the Association for Computational Linguistics: ACL 2026, pages 12123–12138, San Diego, California, United States. Association for Computational Linguistics.