@inproceedings{galichin-etal-2026-feature,
title = "Feature Drift: How Fine-Tuning Repurposes Representations in {LLM}s",
author = "Galichin, Andrey V. and
Korznikov, Anton and
Dontsov, Alexey and
Rogov, Oleg and
Tutubalina, Elena and
Oseledets, Ivan",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.96/",
pages = "1878--1887",
ISBN = "979-8-89176-386-9",
abstract = "Fine-tuning LLMs introduces many important behaviors, such as instruction-following and safety alignment. This makes it crucial to study how fine-tuning changes models' internal mechanisms. Sparse Autoencoders (SAEs) offer a powerful tool for interpreting neural networks by extracting concepts (features) represented in their activations. Previous work observed that SAEs trained on base models transfer effectively to instruction-tuned (chat) models, attributed to activation similarity. In this work, we propose \emph{feature drift} as an alternative explanation: the feature space remains relevant, but the distribution of feature activations changes. In other words, fine-tuning recombines existing concepts rather than learning new ones. We validate this by showing base SAEs reconstruct both base and chat activations comparably despite systematic differences, with individual features exhibiting clear drift patterns. In a refusal behavior case study, we identify base SAE features that drift to activate on harmful instructions in chat models. Causal interventions using these features confirm that they mediate refusal. Our findings suggest that monitoring how existing features drift, rather than searching for entirely new features, may provide a more complete explanation of how fine-tuning changes model capabilities."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="galichin-etal-2026-feature">
<titleInfo>
<title>Feature Drift: How Fine-Tuning Repurposes Representations in LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andrey</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Galichin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Korznikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexey</namePart>
<namePart type="family">Dontsov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Rogov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Tutubalina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Oseledets</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Fine-tuning LLMs introduces many important behaviors, such as instruction-following and safety alignment. This makes it crucial to study how fine-tuning changes models’ internal mechanisms. Sparse Autoencoders (SAEs) offer a powerful tool for interpreting neural networks by extracting concepts (features) represented in their activations. Previous work observed that SAEs trained on base models transfer effectively to instruction-tuned (chat) models, attributed to activation similarity. In this work, we propose *feature drift* as an alternative explanation: the feature space remains relevant, but the distribution of feature activations changes. In other words, fine-tuning recombines existing concepts rather than learning new ones. We validate this by showing base SAEs reconstruct both base and chat activations comparably despite systematic differences, with individual features exhibiting clear drift patterns. In a refusal behavior case study, we identify base SAE features that drift to activate on harmful instructions in chat models. Causal interventions using these features confirm that they mediate refusal. Our findings suggest that monitoring how existing features drift, rather than searching for entirely new features, may provide a more complete explanation of how fine-tuning changes model capabilities.</abstract>
<identifier type="citekey">galichin-etal-2026-feature</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.96/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>1878</start>
<end>1887</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Feature Drift: How Fine-Tuning Repurposes Representations in LLMs
%A Galichin, Andrey V.
%A Korznikov, Anton
%A Dontsov, Alexey
%A Rogov, Oleg
%A Tutubalina, Elena
%A Oseledets, Ivan
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F galichin-etal-2026-feature
%X Fine-tuning LLMs introduces many important behaviors, such as instruction-following and safety alignment. This makes it crucial to study how fine-tuning changes models’ internal mechanisms. Sparse Autoencoders (SAEs) offer a powerful tool for interpreting neural networks by extracting concepts (features) represented in their activations. Previous work observed that SAEs trained on base models transfer effectively to instruction-tuned (chat) models, attributed to activation similarity. In this work, we propose *feature drift* as an alternative explanation: the feature space remains relevant, but the distribution of feature activations changes. In other words, fine-tuning recombines existing concepts rather than learning new ones. We validate this by showing base SAEs reconstruct both base and chat activations comparably despite systematic differences, with individual features exhibiting clear drift patterns. In a refusal behavior case study, we identify base SAE features that drift to activate on harmful instructions in chat models. Causal interventions using these features confirm that they mediate refusal. Our findings suggest that monitoring how existing features drift, rather than searching for entirely new features, may provide a more complete explanation of how fine-tuning changes model capabilities.
%U https://aclanthology.org/2026.findings-eacl.96/
%P 1878-1887
Markdown (Informal)
[Feature Drift: How Fine-Tuning Repurposes Representations in LLMs](https://aclanthology.org/2026.findings-eacl.96/) (Galichin et al., Findings 2026)
ACL