@inproceedings{yu-etal-2026-need,
title = "You Only Need One Single Token to Refine Safety Alignment",
author = "Yu, Wenqian and
Chen, Shuo and
Li, Zhijiang and
Wang, Zhipeng and
Gu, Jindong",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.662/",
pages = "13529--13545",
ISBN = "979-8-89176-395-1",
abstract = "Large language models (LLMs) face a critical alignment challenge: balancing safety with helpfulness. Excessive safety can lead to over-refusal, where models reject harmful-looking yet benign queries, severely limiting utility.Existing training-free interventions offer an efficient way to mitigate over-refusal without re-training, but suffer from high inference overhead and architecture dependency. Our work explores a complementary direction: rather than applying post-hoc corrections to model outputs, our goal is to intrinsically reshape the distributions of harmful and benign samples within the model{'}s decision space. In this paper, we argue that a lightweight training-based approach can more effectively distinguish between harmful and benign samples. We propose Single Token Alignment (STA), which optimizes only a single-token prefix (e.g., 4,096 parameters) while keeping the base model frozen. To address the inherent challenge of achieving robust refinement through such a minimal parameter interface, STA employs a mixed weighting mechanism integrated with its optimization objective. This mechanism incorporates hard weighting via stringent data filtering to provide clear, unbiased learning signals, and soft weighting through a focal mechanism to prioritize challenging cases.Extensive experiments across 9 models and 10 datasets demonstrate that STA achieves a superior safety-helpfulness balance for LLMs, MLLMs, and reasoning models, offering a highly efficient and generalizable solution for refining safety alignment."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2026-need">
<titleInfo>
<title>You Only Need One Single Token to Refine Safety Alignment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wenqian</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuo</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhijiang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhipeng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jindong</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large language models (LLMs) face a critical alignment challenge: balancing safety with helpfulness. Excessive safety can lead to over-refusal, where models reject harmful-looking yet benign queries, severely limiting utility.Existing training-free interventions offer an efficient way to mitigate over-refusal without re-training, but suffer from high inference overhead and architecture dependency. Our work explores a complementary direction: rather than applying post-hoc corrections to model outputs, our goal is to intrinsically reshape the distributions of harmful and benign samples within the model’s decision space. In this paper, we argue that a lightweight training-based approach can more effectively distinguish between harmful and benign samples. We propose Single Token Alignment (STA), which optimizes only a single-token prefix (e.g., 4,096 parameters) while keeping the base model frozen. To address the inherent challenge of achieving robust refinement through such a minimal parameter interface, STA employs a mixed weighting mechanism integrated with its optimization objective. This mechanism incorporates hard weighting via stringent data filtering to provide clear, unbiased learning signals, and soft weighting through a focal mechanism to prioritize challenging cases.Extensive experiments across 9 models and 10 datasets demonstrate that STA achieves a superior safety-helpfulness balance for LLMs, MLLMs, and reasoning models, offering a highly efficient and generalizable solution for refining safety alignment.</abstract>
<identifier type="citekey">yu-etal-2026-need</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.662/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>13529</start>
<end>13545</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T You Only Need One Single Token to Refine Safety Alignment
%A Yu, Wenqian
%A Chen, Shuo
%A Li, Zhijiang
%A Wang, Zhipeng
%A Gu, Jindong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F yu-etal-2026-need
%X Large language models (LLMs) face a critical alignment challenge: balancing safety with helpfulness. Excessive safety can lead to over-refusal, where models reject harmful-looking yet benign queries, severely limiting utility.Existing training-free interventions offer an efficient way to mitigate over-refusal without re-training, but suffer from high inference overhead and architecture dependency. Our work explores a complementary direction: rather than applying post-hoc corrections to model outputs, our goal is to intrinsically reshape the distributions of harmful and benign samples within the model’s decision space. In this paper, we argue that a lightweight training-based approach can more effectively distinguish between harmful and benign samples. We propose Single Token Alignment (STA), which optimizes only a single-token prefix (e.g., 4,096 parameters) while keeping the base model frozen. To address the inherent challenge of achieving robust refinement through such a minimal parameter interface, STA employs a mixed weighting mechanism integrated with its optimization objective. This mechanism incorporates hard weighting via stringent data filtering to provide clear, unbiased learning signals, and soft weighting through a focal mechanism to prioritize challenging cases.Extensive experiments across 9 models and 10 datasets demonstrate that STA achieves a superior safety-helpfulness balance for LLMs, MLLMs, and reasoning models, offering a highly efficient and generalizable solution for refining safety alignment.
%U https://aclanthology.org/2026.findings-acl.662/
%P 13529-13545
Markdown (Informal)
[You Only Need One Single Token to Refine Safety Alignment](https://aclanthology.org/2026.findings-acl.662/) (Yu et al., Findings 2026)
ACL
- Wenqian Yu, Shuo Chen, Zhijiang Li, Zhipeng Wang, and Jindong Gu. 2026. You Only Need One Single Token to Refine Safety Alignment. In Findings of the Association for Computational Linguistics: ACL 2026, pages 13529–13545, San Diego, California, United States. Association for Computational Linguistics.