% ACL Anthology record: conference paper in the ACL 2026 long-papers volume.
% (Text outside an @entry is ignored by BibTeX, so these lines act as comments.)
@inproceedings{minegishi-etal-2026-understanding,
  title     = {Understanding Emergent Misalignment via Feature Superposition Geometry},
  author    = {Minegishi, Gouki and
               Furuta, Hiroki and
               Kojima, Takeshi and
               Iwasawa, Yusuke and
               Matsuo, Yutaka},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.1402/},
  pages     = {30385--30414},
  isbn      = {979-8-89176-390-6},
  abstract  = {Emergent misalignment, where fine-tuning on narrow, non-harmful tasks induces harmful behaviors, poses a key challenge for AI safety in LLMs. Despite growing empirical evidence, its underlying mechanism remains unclear. To uncover the reason behind this phenomenon, we propose a mechanistic account based on the geometry of feature superposition. Because features are encoded in overlapping representations, fine-tuning that amplifies a target feature also unintentionally strengthens nearby harmful features in accordance with their similarity. We give a simple gradient-level derivation of this mechanism and empirically test it across multiple LLMs (Gemma-2 2B/9B/27B, LLaMA-3.1 8B, gpt-oss 20B). Using sparse autoencoders (SAEs), we identify features tied to misalignment-inducing data and to harmful behaviors, and show that they are geometrically closer to each other than features derived from non-inducing data. This trend generalizes across domains (e.g., health, career, legal advice). Finally, we show that a geometry-aware approach{---}filtering training samples nearest to toxic features{---}reduces misalignment by 34.5{\%}, substantially outperforming random removal and achieving stronger mitigation than LLM-as-a-judge{--}based filtering. Our study explains emergent misalignment through feature superposition, providing a basis for understanding and mitigating this phenomenon.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="minegishi-etal-2026-understanding">
<titleInfo>
<title>Understanding Emergent Misalignment via Feature Superposition Geometry</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gouki</namePart>
<namePart type="family">Minegishi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroki</namePart>
<namePart type="family">Furuta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Takeshi</namePart>
<namePart type="family">Kojima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Iwasawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yutaka</namePart>
<namePart type="family">Matsuo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Emergent misalignment, where fine-tuning on narrow, non-harmful tasks induces harmful behaviors, poses a key challenge for AI safety in LLMs. Despite growing empirical evidence, its underlying mechanism remains unclear. To uncover the reason behind this phenomenon, we propose a mechanistic account based on the geometry of feature superposition. Because features are encoded in overlapping representations, fine-tuning that amplifies a target feature also unintentionally strengthens nearby harmful features in accordance with their similarity. We give a simple gradient-level derivation of this mechanism and empirically test it across multiple LLMs (Gemma-2 2B/9B/27B, LLaMA-3.1 8B, gpt-oss 20B). Using sparse autoencoders (SAEs), we identify features tied to misalignment-inducing data and to harmful behaviors, and show that they are geometrically closer to each other than features derived from non-inducing data. This trend generalizes across domains (e.g., health, career, legal advice). Finally, we show that a geometry-aware approach—filtering training samples nearest to toxic features—reduces misalignment by 34.5%, substantially outperforming random removal and achieving stronger mitigation than LLM-as-a-judge–based filtering. Our study explains emergent misalignment through feature superposition, providing a basis for understanding and mitigating this phenomenon.</abstract>
<identifier type="citekey">minegishi-etal-2026-understanding</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.1402/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>30385</start>
<end>30414</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Understanding Emergent Misalignment via Feature Superposition Geometry
%A Minegishi, Gouki
%A Furuta, Hiroki
%A Kojima, Takeshi
%A Iwasawa, Yusuke
%A Matsuo, Yutaka
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F minegishi-etal-2026-understanding
%X Emergent misalignment, where fine-tuning on narrow, non-harmful tasks induces harmful behaviors, poses a key challenge for AI safety in LLMs. Despite growing empirical evidence, its underlying mechanism remains unclear. To uncover the reason behind this phenomenon, we propose a mechanistic account based on the geometry of feature superposition. Because features are encoded in overlapping representations, fine-tuning that amplifies a target feature also unintentionally strengthens nearby harmful features in accordance with their similarity. We give a simple gradient-level derivation of this mechanism and empirically test it across multiple LLMs (Gemma-2 2B/9B/27B, LLaMA-3.1 8B, gpt-oss 20B). Using sparse autoencoders (SAEs), we identify features tied to misalignment-inducing data and to harmful behaviors, and show that they are geometrically closer to each other than features derived from non-inducing data. This trend generalizes across domains (e.g., health, career, legal advice). Finally, we show that a geometry-aware approach—filtering training samples nearest to toxic features—reduces misalignment by 34.5%, substantially outperforming random removal and achieving stronger mitigation than LLM-as-a-judge–based filtering. Our study explains emergent misalignment through feature superposition, providing a basis for understanding and mitigating this phenomenon.
%U https://aclanthology.org/2026.acl-long.1402/
%P 30385-30414
Markdown (Informal)
[Understanding Emergent Misalignment via Feature Superposition Geometry](https://aclanthology.org/2026.acl-long.1402/) (Minegishi et al., ACL 2026)
ACL
- Gouki Minegishi, Hiroki Furuta, Takeshi Kojima, Yusuke Iwasawa, and Yutaka Matsuo. 2026. Understanding Emergent Misalignment via Feature Superposition Geometry. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 30385–30414, San Diego, California, United States. Association for Computational Linguistics.