@inproceedings{mei-etal-2026-hiddenguard,
title = "{H}idden{G}uard: Fine-Grained Safe Generation with Specialized Representation Router",
author = "Mei, Lingrui and
Liu, Shenghua and
Wang, Yiwei and
Bi, Baolong and
Yuan, Ruibin and
Cheng, Xueqi",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.1482/",
pages = "32124--32156",
ISBN = "979-8-89176-390-6",
abstract = "As Large Language Models (LLMs) grow increasingly powerful, ensuring their safety and alignment with human values remains a critical challenge. Current alignment approaches predominantly rely on refusal alignment, such as training models to refuse harmful prompts or implementing filters at various stages to block certain responses. These methods are designed toward a binary outcome: either denying to answer the question entirely or answering with full access to the model{'}s parametric knowledge. The binary nature of current alignment approaches presents significant limitations. These methods often fail to balance safety and utility, resulting in either overly cautious responses or overlooking subtle harmful content. They also prevent users from accessing benign information when it{'}s mixed with harmful content. For instance, a model might refuse to provide basic, public information about a medication{'}s composition due to misuse concerns. Furthermore, these approaches struggle with context-dependent sensitivity, potentially over-censoring harmless content or missing nuanced harmful outputs. Ideally, LLMs should offer informative responses while avoiding the disclosure of harmful and sensitive information. To address these challenges, we introduce HiddenGuard, a novel framework for fine-grained safe generation in LLMs. Our method incorporates PRISM (rePresentation Router for In-Stream Moderation), a specialized moudule that operates alongside the LLM architecture. By leveraging intermediate hidden states, HiddenGuard enables real-time, token-level harmfulness detection and redaction, without loss in capability. This approach captures deeper semantic information, allowing for more nuanced and context-aware content control compared to traditional filtering techniques. Consequently, the model can generate informative responses while selectively redacting or replacing sensitive information, rather than refusing to answer outright. We also contribute a comprehensive dataset with token-level fine-grained annotations of potentially harmful information across diverse contexts. Our experiments demonstrate that HiddenGuard achieves over 90{\%} in F1 score for detecting and redacting harmful content while preserving the overall utility and informativeness of the model{'}s responses."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mei-etal-2026-hiddenguard">
<titleInfo>
<title>HiddenGuard: Fine-Grained Safe Generation with Specialized Representation Router</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lingrui</namePart>
<namePart type="family">Mei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shenghua</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiwei</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baolong</namePart>
<namePart type="family">Bi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruibin</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xueqi</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>As Large Language Models (LLMs) grow increasingly powerful, ensuring their safety and alignment with human values remains a critical challenge. Current alignment approaches predominantly rely on refusal alignment, such as training models to refuse harmful prompts or implementing filters at various stages to block certain responses. These methods are designed toward a binary outcome: either denying to answer the question entirely or answering with full access to the model’s parametric knowledge. The binary nature of current alignment approaches presents significant limitations. These methods often fail to balance safety and utility, resulting in either overly cautious responses or overlooking subtle harmful content. They also prevent users from accessing benign information when it’s mixed with harmful content. For instance, a model might refuse to provide basic, public information about a medication’s composition due to misuse concerns. Furthermore, these approaches struggle with context-dependent sensitivity, potentially over-censoring harmless content or missing nuanced harmful outputs. Ideally, LLMs should offer informative responses while avoiding the disclosure of harmful and sensitive information. To address these challenges, we introduce HiddenGuard, a novel framework for fine-grained safe generation in LLMs. Our method incorporates PRISM (rePresentation Router for In-Stream Moderation), a specialized moudule that operates alongside the LLM architecture. By leveraging intermediate hidden states, HiddenGuard enables real-time, token-level harmfulness detection and redaction, without loss in capability. This approach captures deeper semantic information, allowing for more nuanced and context-aware content control compared to traditional filtering techniques. Consequently, the model can generate informative responses while selectively redacting or replacing sensitive information, rather than refusing to answer outright. We also contribute a comprehensive dataset with token-level fine-grained annotations of potentially harmful information across diverse contexts. Our experiments demonstrate that HiddenGuard achieves over 90% in F1 score for detecting and redacting harmful content while preserving the overall utility and informativeness of the model’s responses.</abstract>
<identifier type="citekey">mei-etal-2026-hiddenguard</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.1482/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>32124</start>
<end>32156</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HiddenGuard: Fine-Grained Safe Generation with Specialized Representation Router
%A Mei, Lingrui
%A Liu, Shenghua
%A Wang, Yiwei
%A Bi, Baolong
%A Yuan, Ruibin
%A Cheng, Xueqi
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F mei-etal-2026-hiddenguard
%X As Large Language Models (LLMs) grow increasingly powerful, ensuring their safety and alignment with human values remains a critical challenge. Current alignment approaches predominantly rely on refusal alignment, such as training models to refuse harmful prompts or implementing filters at various stages to block certain responses. These methods are designed toward a binary outcome: either denying to answer the question entirely or answering with full access to the model’s parametric knowledge. The binary nature of current alignment approaches presents significant limitations. These methods often fail to balance safety and utility, resulting in either overly cautious responses or overlooking subtle harmful content. They also prevent users from accessing benign information when it’s mixed with harmful content. For instance, a model might refuse to provide basic, public information about a medication’s composition due to misuse concerns. Furthermore, these approaches struggle with context-dependent sensitivity, potentially over-censoring harmless content or missing nuanced harmful outputs. Ideally, LLMs should offer informative responses while avoiding the disclosure of harmful and sensitive information. To address these challenges, we introduce HiddenGuard, a novel framework for fine-grained safe generation in LLMs. Our method incorporates PRISM (rePresentation Router for In-Stream Moderation), a specialized moudule that operates alongside the LLM architecture. By leveraging intermediate hidden states, HiddenGuard enables real-time, token-level harmfulness detection and redaction, without loss in capability. This approach captures deeper semantic information, allowing for more nuanced and context-aware content control compared to traditional filtering techniques. Consequently, the model can generate informative responses while selectively redacting or replacing sensitive information, rather than refusing to answer outright. We also contribute a comprehensive dataset with token-level fine-grained annotations of potentially harmful information across diverse contexts. Our experiments demonstrate that HiddenGuard achieves over 90% in F1 score for detecting and redacting harmful content while preserving the overall utility and informativeness of the model’s responses.
%U https://aclanthology.org/2026.acl-long.1482/
%P 32124-32156
Markdown (Informal)
[HiddenGuard: Fine-Grained Safe Generation with Specialized Representation Router](https://aclanthology.org/2026.acl-long.1482/) (Mei et al., ACL 2026)
ACL