@inproceedings{xue-etal-2026-deactivating,
title = "Deactivating Refusal Triggers: Understanding and Mitigating Overrefusal in Safety Alignment",
author = "Xue, Zhiyu and
Qi, Zimo and
Liu, Guangliang and
Chen, Bocheng and
Pedarsani, Ramtin",
editor = "Chang, Kai-Wei and
Mehrabi, Ninareh and
Krishna, Satyapriya and
Das, Anubrata and
Dhamala, Jwala and
Cao, Yang Trista and
Kumarage, Tharindu and
Ramakrishna, Anil and
Christodoulopoulos, Christos and
Wan, Yixin and
Galystan, Aram and
Kumar, Anoop and
Gupta, Rahul",
booktitle = "Proceedings of the 6th Workshop on Trustworthy {NLP} ({T}rust{NLP} 2026)",
month = jul,
year = "2026",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.trustnlp-main.26/",
pages = "402--412",
ISBN = "979-8-89176-418-7",
abstract = "Safety alignment aims to ensure that large language models (LLMs) refuse harmful requests by post-training on harmful queries paired with refusal answers.Although safety alignment is widely adopted in industry, the overrefusal problem where aligned LLMs also reject benign queries after safety alignment post-training, remains insufficiently studied. Such an issue degrades the usability of safety alignment in real-world applications.In this paper, we examine how overrefusal arises under safety alignment, and propose a mitigation strategy inspired by our findings. We define refusal triggers as linguistic cues in the training data that elicit refusal responses, safety alignment encourages LLMs to associate refusal triggers within a training sample with refusal responses, leading aligned LLMs to refuse harmful queries.However, the refusal triggers include not only harmful linguistic cues but also non-harmful cues, therefore causing overrefusal to benign queries.Building on this mechanistic analysis, we propose a method that explicitly considers refusal triggers in the safety alignment fine-tuning.Empirical results demonstrate that our approach achieves a more favorable trade-off between defense against jailbreak attacks and responsiveness to benign queries, outperforming prior methods. Warning: this paper contains harmful and biased sentences."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xue-etal-2026-deactivating">
<titleInfo>
<title>Deactivating Refusal Triggers: Understanding and Mitigating Overrefusal in Safety Alignment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhiyu</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zimo</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guangliang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bocheng</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramtin</namePart>
<namePart type="family">Pedarsani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Satyapriya</namePart>
<namePart type="family">Krishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anubrata</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="given">Trista</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Kumarage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anil</namePart>
<namePart type="family">Ramakrishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yixin</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aram</namePart>
<namePart type="family">Galystan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-418-7</identifier>
</relatedItem>
<abstract>Safety alignment aims to ensure that large language models (LLMs) refuse harmful requests by post-training on harmful queries paired with refusal answers.Although safety alignment is widely adopted in industry, the overrefusal problem where aligned LLMs also reject benign queries after safety alignment post-training, remains insufficiently studied. Such an issue degrades the usability of safety alignment in real-world applications.In this paper, we examine how overrefusal arises under safety alignment, and propose a mitigation strategy inspired by our findings. We define refusal triggers as linguistic cues in the training data that elicit refusal responses, safety alignment encourages LLMs to associate refusal triggers within a training sample with refusal responses, leading aligned LLMs to refuse harmful queries.However, the refusal triggers include not only harmful linguistic cues but also non-harmful cues, therefore causing overrefusal to benign queries.Building on this mechanistic analysis, we propose a method that explicitly considers refusal triggers in the safety alignment fine-tuning.Empirical results demonstrate that our approach achieves a more favorable trade-off between defense against jailbreak attacks and responsiveness to benign queries, outperforming prior methods. Warning: this paper contains harmful and biased sentences.</abstract>
<identifier type="citekey">xue-etal-2026-deactivating</identifier>
<location>
<url>https://aclanthology.org/2026.trustnlp-main.26/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>402</start>
<end>412</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Deactivating Refusal Triggers: Understanding and Mitigating Overrefusal in Safety Alignment
%A Xue, Zhiyu
%A Qi, Zimo
%A Liu, Guangliang
%A Chen, Bocheng
%A Pedarsani, Ramtin
%Y Chang, Kai-Wei
%Y Mehrabi, Ninareh
%Y Krishna, Satyapriya
%Y Das, Anubrata
%Y Dhamala, Jwala
%Y Cao, Yang Trista
%Y Kumarage, Tharindu
%Y Ramakrishna, Anil
%Y Christodoulopoulos, Christos
%Y Wan, Yixin
%Y Galystan, Aram
%Y Kumar, Anoop
%Y Gupta, Rahul
%S Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-418-7
%F xue-etal-2026-deactivating
%X Safety alignment aims to ensure that large language models (LLMs) refuse harmful requests by post-training on harmful queries paired with refusal answers.Although safety alignment is widely adopted in industry, the overrefusal problem where aligned LLMs also reject benign queries after safety alignment post-training, remains insufficiently studied. Such an issue degrades the usability of safety alignment in real-world applications.In this paper, we examine how overrefusal arises under safety alignment, and propose a mitigation strategy inspired by our findings. We define refusal triggers as linguistic cues in the training data that elicit refusal responses, safety alignment encourages LLMs to associate refusal triggers within a training sample with refusal responses, leading aligned LLMs to refuse harmful queries.However, the refusal triggers include not only harmful linguistic cues but also non-harmful cues, therefore causing overrefusal to benign queries.Building on this mechanistic analysis, we propose a method that explicitly considers refusal triggers in the safety alignment fine-tuning.Empirical results demonstrate that our approach achieves a more favorable trade-off between defense against jailbreak attacks and responsiveness to benign queries, outperforming prior methods. Warning: this paper contains harmful and biased sentences.
%U https://aclanthology.org/2026.trustnlp-main.26/
%P 402-412
Markdown (Informal)
[Deactivating Refusal Triggers: Understanding and Mitigating Overrefusal in Safety Alignment](https://aclanthology.org/2026.trustnlp-main.26/) (Xue et al., TrustNLP 2026)
ACL