@inproceedings{zeng-etal-2026-safesteer,
title = "{S}afe{S}teer: A Decoding-level Defense Mechanism for Multimodal Large Language Models",
author = "Zeng, Xinyi and
Yang, Xue and
Zhang, Jingyuan and
Yan, Huanqian and
Chen, Xiang and
Wei, Kaiwen and
Kang, Hankun and
Tian, Yu",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.916/",
pages = "18411--18422",
ISBN = "979-8-89176-395-1",
abstract = "Multimodal large language models (MLLMs) are gaining increasing attention. Due to the heterogeneity of their input features, they face significant challenges in terms of jailbreak defenses. Current defense methods rely on costly fine-tuning or inefficient post-hoc interventions, limiting their ability to address novel attacks and involving performance trade-offs. To address the above issues, we explore the endogenous safety capabilities within MLLMs and quantify their intrinsic ability to discern harmfulness at both encoding and decoding stages. We observe that 1) MLLMs can distinguish the harmful and harmless inputs during decoding process, 2) Image-based attacks are more stealthy. Based on these insights, we introduce SafeSteer, a decoding-level defense mechanism for MLLMs. Specifically, it employs a lightweight discriminator, based on the MLLM{'}s own discriminative ability, to iteratively steer the decoding process toward safety. A safety alignment vector is also integrated to handle complex multimodal threats. Experiments on multiple MLLMs demonstrate that our proposed method can improve safety performance by up to 33.40{\%} without fine-tuning."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zeng-etal-2026-safesteer">
<titleInfo>
<title>SafeSteer: A Decoding-level Defense Mechanism for Multimodal Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xinyi</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xue</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingyuan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huanqian</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaiwen</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hankun</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Multimodal large language models (MLLMs) are gaining increasing attention. Due to the heterogeneity of their input features, they face significant challenges in terms of jailbreak defenses. Current defense methods rely on costly fine-tuning or inefficient post-hoc interventions, limiting their ability to address novel attacks and involving performance trade-offs. To address the above issues, we explore the endogenous safety capabilities within MLLMs and quantify their intrinsic ability to discern harmfulness at both encoding and decoding stages. We observe that 1) MLLMs can distinguish the harmful and harmless inputs during decoding process, 2) Image-based attacks are more stealthy. Based on these insights, we introduce SafeSteer, a decoding-level defense mechanism for MLLMs. Specifically, it employs a lightweight discriminator, based on the MLLM’s own discriminative ability, to iteratively steer the decoding process toward safety. A safety alignment vector is also integrated to handle complex multimodal threats. Experiments on multiple MLLMs demonstrate that our proposed method can improve safety performance by up to 33.40% without fine-tuning.</abstract>
<identifier type="citekey">zeng-etal-2026-safesteer</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.916/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>18411</start>
<end>18422</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SafeSteer: A Decoding-level Defense Mechanism for Multimodal Large Language Models
%A Zeng, Xinyi
%A Yang, Xue
%A Zhang, Jingyuan
%A Yan, Huanqian
%A Chen, Xiang
%A Wei, Kaiwen
%A Kang, Hankun
%A Tian, Yu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zeng-etal-2026-safesteer
%X Multimodal large language models (MLLMs) are gaining increasing attention. Due to the heterogeneity of their input features, they face significant challenges in terms of jailbreak defenses. Current defense methods rely on costly fine-tuning or inefficient post-hoc interventions, limiting their ability to address novel attacks and involving performance trade-offs. To address the above issues, we explore the endogenous safety capabilities within MLLMs and quantify their intrinsic ability to discern harmfulness at both encoding and decoding stages. We observe that 1) MLLMs can distinguish the harmful and harmless inputs during decoding process, 2) Image-based attacks are more stealthy. Based on these insights, we introduce SafeSteer, a decoding-level defense mechanism for MLLMs. Specifically, it employs a lightweight discriminator, based on the MLLM’s own discriminative ability, to iteratively steer the decoding process toward safety. A safety alignment vector is also integrated to handle complex multimodal threats. Experiments on multiple MLLMs demonstrate that our proposed method can improve safety performance by up to 33.40% without fine-tuning.
%U https://aclanthology.org/2026.findings-acl.916/
%P 18411-18422
Markdown (Informal)
[SafeSteer: A Decoding-level Defense Mechanism for Multimodal Large Language Models](https://aclanthology.org/2026.findings-acl.916/) (Zeng et al., Findings 2026)
ACL
- Xinyi Zeng, Xue Yang, Jingyuan Zhang, Huanqian Yan, Xiang Chen, Kaiwen Wei, Hankun Kang, and Yu Tian. 2026. SafeSteer: A Decoding-level Defense Mechanism for Multimodal Large Language Models. In Findings of the Association for Computational Linguistics: ACL 2026, pages 18411–18422, San Diego, California, United States. Association for Computational Linguistics.