% BibTeX record for the COSMIC paper (Findings of ACL 2025, Anthology ID 2025.findings-acl.1310).
% The abstract is stored as plain text: Markdown emphasis markers are not interpreted by BibTeX/LaTeX.
@inproceedings{siu-etal-2025-cosmic,
    title     = {{COSMIC}: Generalized Refusal Direction Identification in {LLM} Activations},
    author    = {Siu, Vincent and
                 Crispino, Nicholas and
                 Yu, Zihao and
                 Pan, Sam and
                 Wang, Zhun and
                 Liu, Yang and
                 Song, Dawn and
                 Wang, Chenguang},
    editor    = {Che, Wanxiang and
                 Nabende, Joyce and
                 Shutova, Ekaterina and
                 Pilehvar, Mohammad Taher},
    booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2025},
    month     = jul,
    year      = {2025},
    address   = {Vienna, Austria},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.findings-acl.1310/},
    doi       = {10.18653/v1/2025.findings-acl.1310},
    pages     = {25534--25553},
    isbn      = {979-8-89176-256-5},
    abstract  = {Large Language Models encode behaviors like refusal within their activation space, but identifying these behaviors remains challenging. Existing methods depend on predefined refusal templates detectable in output tokens or manual review. We introduce COSMIC (Cosine Similarity Metrics for Inversion of Concepts), an automated framework for direction selection that optimally identifies steering directions and target layers using cosine similarity, entirely independent of output text. COSMIC achieves steering effectiveness comparable to prior work without any prior knowledge or assumptions of a model{'}s refusal behavior such as the use of certain refusal tokens. Additionally, COSMIC successfully identifies refusal directions in adversarial scenarios and models with weak safety alignment, demonstrating its robustness across diverse settings.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for citekey siu-etal-2025-cosmic. The top-level <name> elements are the
       paper's authors; the host <relatedItem> describes the proceedings volume (editors,
       publisher, place, ISBN). The abstract is plain text with no inline markup. -->
  <mods ID="siu-etal-2025-cosmic">
    <titleInfo>
      <title>COSMIC: Generalized Refusal Direction Identification in LLM Activations</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Vincent</namePart>
      <namePart type="family">Siu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nicholas</namePart>
      <namePart type="family">Crispino</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zihao</namePart>
      <namePart type="family">Yu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sam</namePart>
      <namePart type="family">Pan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhun</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yang</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dawn</namePart>
      <namePart type="family">Song</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chenguang</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>
    <abstract>Large Language Models encode behaviors like refusal within their activation space, but identifying these behaviors remains challenging. Existing methods depend on predefined refusal templates detectable in output tokens or manual review. We introduce COSMIC (Cosine Similarity Metrics for Inversion of Concepts), an automated framework for direction selection that optimally identifies steering directions and target layers using cosine similarity, entirely independent of output text. COSMIC achieves steering effectiveness comparable to prior work without any prior knowledge or assumptions of a model’s refusal behavior such as the use of certain refusal tokens. Additionally, COSMIC successfully identifies refusal directions in adversarial scenarios and models with weak safety alignment, demonstrating its robustness across diverse settings.</abstract>
    <identifier type="citekey">siu-etal-2025-cosmic</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-acl.1310</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-acl.1310/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>25534</start>
        <end>25553</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T COSMIC: Generalized Refusal Direction Identification in LLM Activations
%A Siu, Vincent
%A Crispino, Nicholas
%A Yu, Zihao
%A Pan, Sam
%A Wang, Zhun
%A Liu, Yang
%A Song, Dawn
%A Wang, Chenguang
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F siu-etal-2025-cosmic
%X Large Language Models encode behaviors like refusal within their activation space, but identifying these behaviors remains challenging. Existing methods depend on predefined refusal templates detectable in output tokens or manual review. We introduce COSMIC (Cosine Similarity Metrics for Inversion of Concepts), an automated framework for direction selection that optimally identifies steering directions and target layers using cosine similarity, entirely independent of output text. COSMIC achieves steering effectiveness comparable to prior work without any prior knowledge or assumptions of a model’s refusal behavior such as the use of certain refusal tokens. Additionally, COSMIC successfully identifies refusal directions in adversarial scenarios and models with weak safety alignment, demonstrating its robustness across diverse settings.
%R 10.18653/v1/2025.findings-acl.1310
%U https://aclanthology.org/2025.findings-acl.1310/
%U https://doi.org/10.18653/v1/2025.findings-acl.1310
%P 25534-25553
Markdown (Informal)
[COSMIC: Generalized Refusal Direction Identification in LLM Activations](https://aclanthology.org/2025.findings-acl.1310/) (Siu et al., Findings 2025)
ACL
- Vincent Siu, Nicholas Crispino, Zihao Yu, Sam Pan, Zhun Wang, Yang Liu, Dawn Song, and Chenguang Wang. 2025. COSMIC: Generalized Refusal Direction Identification in LLM Activations. In Findings of the Association for Computational Linguistics: ACL 2025, pages 25534–25553, Vienna, Austria. Association for Computational Linguistics.