@inproceedings{olmo-etal-2025-features,
title = "Features that Make a Difference: Leveraging Gradients for Improved Dictionary Learning",
author = "Olmo, Jeffrey and
Wilson, Jared and
Forsey, Max and
Hepner, Bryce and
Howe, Thomas Vincent and
Wingate, David",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.423/",
doi = "10.18653/v1/2025.findings-naacl.423",
pages = "7609--7619",
ISBN = "979-8-89176-195-7",
abstract = "Sparse Autoencoders (SAEs) are a promising approach for extracting neural network representations by learning a sparse and overcomplete decomposition of the network{'}s internal activations. However, SAEs are traditionally trained considering only activation values and not the effect those activations have on downstream computations. This limits the information available to learn features, and biases the autoencoder towards neglecting features which are represented with small activation values but strongly influence model outputs.To address this, we introduce Gradient SAEs (g-SAEs), which modify the $k$-sparse autoencoder architecture by augmenting the TopK activation function to rely on the gradients of the input activation when selecting the $k$ elements. For a given sparsity level, g-SAEs produce reconstructions that are more faithful to original network performance when propagated through the network.Additionally, we find evidence that g-SAEs learn latents that are on average more effective at steering models in arbitrary contexts.By considering the downstream effects of activations, our approach leverages the dual nature of neural network features as both representations, retrospectively, and actions, prospectively. While previous methods have approached the problem of feature discovery primarily focused on the former aspect, g-SAEs represent a step towards accounting for the latter as well."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="olmo-etal-2025-features">
<titleInfo>
<title>Features that Make a Difference: Leveraging Gradients for Improved Dictionary Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jeffrey</namePart>
<namePart type="family">Olmo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jared</namePart>
<namePart type="family">Wilson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Forsey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryce</namePart>
<namePart type="family">Hepner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="given">Vincent</namePart>
<namePart type="family">Howe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Wingate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Sparse Autoencoders (SAEs) are a promising approach for extracting neural network representations by learning a sparse and overcomplete decomposition of the network’s internal activations. However, SAEs are traditionally trained considering only activation values and not the effect those activations have on downstream computations. This limits the information available to learn features, and biases the autoencoder towards neglecting features which are represented with small activation values but strongly influence model outputs.To address this, we introduce Gradient SAEs (g-SAEs), which modify the k-sparse autoencoder architecture by augmenting the TopK activation function to rely on the gradients of the input activation when selecting the k elements. For a given sparsity level, g-SAEs produce reconstructions that are more faithful to original network performance when propagated through the network.Additionally, we find evidence that g-SAEs learn latents that are on average more effective at steering models in arbitrary contexts.By considering the downstream effects of activations, our approach leverages the dual nature of neural network features as both representations, retrospectively, and actions, prospectively. While previous methods have approached the problem of feature discovery primarily focused on the former aspect, g-SAEs represent a step towards accounting for the latter as well.</abstract>
<identifier type="citekey">olmo-etal-2025-features</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.423</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.423/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>7609</start>
<end>7619</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Features that Make a Difference: Leveraging Gradients for Improved Dictionary Learning
%A Olmo, Jeffrey
%A Wilson, Jared
%A Forsey, Max
%A Hepner, Bryce
%A Howe, Thomas Vincent
%A Wingate, David
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F olmo-etal-2025-features
%X Sparse Autoencoders (SAEs) are a promising approach for extracting neural network representations by learning a sparse and overcomplete decomposition of the network’s internal activations. However, SAEs are traditionally trained considering only activation values and not the effect those activations have on downstream computations. This limits the information available to learn features, and biases the autoencoder towards neglecting features which are represented with small activation values but strongly influence model outputs. To address this, we introduce Gradient SAEs (g-SAEs), which modify the k-sparse autoencoder architecture by augmenting the TopK activation function to rely on the gradients of the input activation when selecting the k elements. For a given sparsity level, g-SAEs produce reconstructions that are more faithful to original network performance when propagated through the network. Additionally, we find evidence that g-SAEs learn latents that are on average more effective at steering models in arbitrary contexts. By considering the downstream effects of activations, our approach leverages the dual nature of neural network features as both representations, retrospectively, and actions, prospectively. While previous methods have approached the problem of feature discovery primarily focused on the former aspect, g-SAEs represent a step towards accounting for the latter as well.
%R 10.18653/v1/2025.findings-naacl.423
%U https://aclanthology.org/2025.findings-naacl.423/
%U https://doi.org/10.18653/v1/2025.findings-naacl.423
%P 7609-7619
Markdown (Informal)
[Features that Make a Difference: Leveraging Gradients for Improved Dictionary Learning](https://aclanthology.org/2025.findings-naacl.423/) (Olmo et al., Findings 2025)
ACL
Jeffrey Olmo, Jared Wilson, Max Forsey, Bryce Hepner, Thomas Vincent Howe, and David Wingate. 2025. Features that Make a Difference: Leveraging Gradients for Improved Dictionary Learning. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 7609–7619, Albuquerque, New Mexico. Association for Computational Linguistics.
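
The abstract's core idea is a TopK activation that selects the k active latents using gradient information rather than activation value alone. The PyTorch sketch below is a rough illustration of that idea only, not the authors' implementation: the class name, the scoring rule (|pre-activation| times |loss gradient projected onto the decoder direction|), and the interface are assumptions made for this sketch, since the abstract states only that selection relies on gradients of the input activation.

```python
import torch
import torch.nn as nn


class GradTopKSAE(nn.Module):
    """Minimal sketch of a k-sparse autoencoder with gradient-aware TopK
    selection, per the paper's abstract. The scoring rule below is an
    illustrative assumption, not the authors' criterion."""

    def __init__(self, d_model: int, d_dict: int, k: int):
        super().__init__()
        self.k = k
        self.enc = nn.Linear(d_model, d_dict)  # activations -> latent pre-activations
        self.dec = nn.Linear(d_dict, d_model)  # sparse latents -> reconstruction

    def forward(self, x: torch.Tensor, grad_x: torch.Tensor) -> torch.Tensor:
        # x:      host-model activations, shape (batch, d_model)
        # grad_x: gradient of the downstream loss w.r.t. x, same shape,
        #         obtained from a backward pass through the host model
        pre = self.enc(x)                     # (batch, d_dict)
        # Estimate each latent's downstream influence by projecting the
        # gradient onto its decoder direction (assumed scoring rule).
        influence = grad_x @ self.dec.weight  # (batch, d_dict)
        score = pre.abs() * influence.abs()
        # Keep the k latents with the highest gradient-aware score, but
        # reconstruct from their ordinary pre-activation values.
        idx = score.topk(self.k, dim=-1).indices
        z = torch.zeros_like(pre).scatter(-1, idx, pre.gather(-1, idx))
        return self.dec(z)
```

In use, `grad_x` would come from backpropagating the host model's loss to the hooked activation (e.g. `x.requires_grad_(True)`, run the model, `loss.backward()`, then read `x.grad`); the paper's actual selection criterion and training details may differ.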