@inproceedings{zhao-etal-2025-zero,
title = "Zero-Shot Defense Against Toxic Images via Inherent Multimodal Alignment in {LVLM}s",
author = "Zhao, Wei and
Li, Zhe and
Li, Yige and
Sun, Jun",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.767/",
doi = "10.18653/v1/2025.findings-emnlp.767",
pages = "14232--14246",
ISBN = "979-8-89176-335-7",
abstract = "Large Vision-Language Models (LVLMs) have made significant strides in multimodal comprehension, thanks to extensive pre-training and fine-tuning on large-scale visual datasets. However, despite their robust textual safety mechanisms, they remain vulnerable to harmful visual inputs. Existing safeguards{---}typically relying on pre-filtering or fine-tuning{---}incur high costs and diminish overall utility. To address this critical vulnerability, we introduce SafeCLIP, a lightweight method that leverages LVLMs' inherent multimodal alignment for zero-shot toxic image detection. By projecting CLIP{'}s discarded CLS token into its text space and matching it with toxic descriptors, SafeCLIP detects harmful content without any architectural changes{---}adding minimal latency and enabling dynamic safety corrections during inference and fine-tuning. Experiments show that SafeCLIP achieves a 66.9{\%} defense success rate with only 3.2{\%} false positive rate and 7.2{\%} overhead. In contrast, state-of-the-art methods achieve 52.9{\%} success but have a 10.7{\%} false positive rate and 210{\%} overhead. Our work demonstrates that leveraging inherent multimodal alignment can yield efficient, low-cost LVLM safety. Code is available at \url{anonymous.4open.science/r/safeclip-2C01}."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2025-zero">
<titleInfo>
<title>Zero-Shot Defense Against Toxic Images via Inherent Multimodal Alignment in LVLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhe</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yige</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Large Vision-Language Models (LVLMs) have made significant strides in multimodal comprehension, thanks to extensive pre-training and fine-tuning on large-scale visual datasets. However, despite their robust textual safety mechanisms, they remain vulnerable to harmful visual inputs. Existing safeguards—typically relying on pre-filtering or fine-tuning—incur high costs and diminish overall utility. To address this critical vulnerability, we introduce SafeCLIP, a lightweight method that leverages LVLMs’ inherent multimodal alignment for zero-shot toxic image detection. By projecting CLIP’s discarded CLS token into its text space and matching it with toxic descriptors, SafeCLIP detects harmful content without any architectural changes—adding minimal latency and enabling dynamic safety corrections during inference and fine-tuning. Experiments show that SafeCLIP achieves a 66.9% defense success rate with only 3.2% false positive rate and 7.2% overhead. In contrast, state-of-the-art methods achieve 52.9% success but have a 10.7% false positive rate and 210% overhead. Our work demonstrates that leveraging inherent multimodal alignment can yield efficient, low-cost LVLM safety. Code is available at anonymous.4open.science/r/safeclip-2C01.</abstract>
<identifier type="citekey">zhao-etal-2025-zero</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.767</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.767/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>14232</start>
<end>14246</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Zero-Shot Defense Against Toxic Images via Inherent Multimodal Alignment in LVLMs
%A Zhao, Wei
%A Li, Zhe
%A Li, Yige
%A Sun, Jun
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F zhao-etal-2025-zero
%X Large Vision-Language Models (LVLMs) have made significant strides in multimodal comprehension, thanks to extensive pre-training and fine-tuning on large-scale visual datasets. However, despite their robust textual safety mechanisms, they remain vulnerable to harmful visual inputs. Existing safeguards—typically relying on pre-filtering or fine-tuning—incur high costs and diminish overall utility. To address this critical vulnerability, we introduce SafeCLIP, a lightweight method that leverages LVLMs’ inherent multimodal alignment for zero-shot toxic image detection. By projecting CLIP’s discarded CLS token into its text space and matching it with toxic descriptors, SafeCLIP detects harmful content without any architectural changes—adding minimal latency and enabling dynamic safety corrections during inference and fine-tuning. Experiments show that SafeCLIP achieves a 66.9% defense success rate with only 3.2% false positive rate and 7.2% overhead. In contrast, state-of-the-art methods achieve 52.9% success but have a 10.7% false positive rate and 210% overhead. Our work demonstrates that leveraging inherent multimodal alignment can yield efficient, low-cost LVLM safety. Code is available at anonymous.4open.science/r/safeclip-2C01.
%R 10.18653/v1/2025.findings-emnlp.767
%U https://aclanthology.org/2025.findings-emnlp.767/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.767
%P 14232-14246
Markdown (Informal)
[Zero-Shot Defense Against Toxic Images via Inherent Multimodal Alignment in LVLMs](https://aclanthology.org/2025.findings-emnlp.767/) (Zhao et al., Findings 2025)
ACL