@inproceedings{kavuri-etal-2025-freeze,
title = "Freeze and Reveal: Exposing Modality Bias in Vision-Language Models",
author = "Kavuri, Vivek Hruday and
Karanam, Vysishtya and
Jahnavi, Venkamsetty Venkata and
Madumadukala, Kriti and
Darur, Balaji Lakshmipathi and
Kumaraguru, Ponnurangam",
editor = "Przyby{\l}a, Piotr and
Shardlow, Matthew and
Colombatto, Clara and
Inie, Nanna",
booktitle = "Proceedings of Interdisciplinary Workshop on Observations of Misunderstood, Misguided and Malicious Use of Language Models",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ommm-1.2/",
pages = "17--26",
abstract = "Vision-Language Models (VLMs) achieve impressive multimodal performance but often inherit gender biases from their training data. This bias can arise from both the vision and text modalities. In this work, we dissect the contributions of the vision and text backbones to these biases by applying targeted debiasing{---}Counterfactual Data Augmentation (CDA) and Task Vector methods. Inspired by data-efficient approaches in hate speech classification, we introduce a novel metric, Degree of Stereotypicality (DoS), and a corresponding debiasing method, Data Augmentation Using DoS (DAUDoS), to reduce bias with minimal computational cost. We curate a gender-annotated dataset and evaluate all methods on the VisoGender benchmark to quantify improvements and identify the dominant source of bias. Our results show that CDA reduces the gender gap by 6{\%}, while DAUDoS reduces it by 3{\%} using only one-third of the data. Both methods also improve the model{'}s ability to correctly identify gender in images by 3{\%}, with DAUDoS achieving this improvement with only about one-third of the training data. Our experiments show that CLIP{'}s vision encoder is the more biased component, whereas in PaliGemma2 it is the text encoder. By identifying whether the bias stems more from the vision or the text encoder, our work enables more targeted and effective bias mitigation strategies in future multimodal systems."
}