BibTeX

@inproceedings{cho-hockenmaier-2025-toward,
title = "Toward Efficient Sparse Autoencoder-Guided Steering for Improved In-Context Learning in Large Language Models",
author = "Cho, Ikhyun and
Hockenmaier, Julia",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1474/",
pages = "28949--28961",
ISBN = "979-8-89176-332-6",
abstract = "Sparse autoencoders (SAEs) have emerged as a powerful analytical tool in mechanistic interpretability for large language models (LLMs), with growing success in applications beyond interpretability. Building on this momentum, we present a novel approach that leverages SAEs to enhance the general in-context learning (ICL) performance of LLMs.Specifically, we introduce Feature Detection through Prompt Variation (FDPV), which leverages the SAE{'}s remarkable ability to capture subtle differences between prompts, enabling efficient feature selection for downstream steering. In addition, we propose a novel steering method tailored to ICL{---}Selective In-Context Steering (SISTER){---}grounded in recent insights from ICL research that LLMs utilize label words as key anchors. Our method yields a 3.5{\%} average performance improvement across diverse text classification tasks and exhibits greater robustness to hyperparameter variations compared to standard steering approaches. Our code is available at https://github.com/ihcho2/SAE-ICL."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cho-hockenmaier-2025-toward">
<titleInfo>
<title>Toward Efficient Sparse Autoencoder-Guided Steering for Improved In-Context Learning in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ikhyun</namePart>
<namePart type="family">Cho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Hockenmaier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Sparse autoencoders (SAEs) have emerged as a powerful analytical tool in mechanistic interpretability for large language models (LLMs), with growing success in applications beyond interpretability. Building on this momentum, we present a novel approach that leverages SAEs to enhance the general in-context learning (ICL) performance of LLMs.Specifically, we introduce Feature Detection through Prompt Variation (FDPV), which leverages the SAE’s remarkable ability to capture subtle differences between prompts, enabling efficient feature selection for downstream steering. In addition, we propose a novel steering method tailored to ICL—Selective In-Context Steering (SISTER)—grounded in recent insights from ICL research that LLMs utilize label words as key anchors. Our method yields a 3.5% average performance improvement across diverse text classification tasks and exhibits greater robustness to hyperparameter variations compared to standard steering approaches. Our code is available at https://github.com/ihcho2/SAE-ICL.</abstract>
<identifier type="citekey">cho-hockenmaier-2025-toward</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1474/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>28949</start>
<end>28961</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Toward Efficient Sparse Autoencoder-Guided Steering for Improved In-Context Learning in Large Language Models
%A Cho, Ikhyun
%A Hockenmaier, Julia
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F cho-hockenmaier-2025-toward
%X Sparse autoencoders (SAEs) have emerged as a powerful analytical tool in mechanistic interpretability for large language models (LLMs), with growing success in applications beyond interpretability. Building on this momentum, we present a novel approach that leverages SAEs to enhance the general in-context learning (ICL) performance of LLMs. Specifically, we introduce Feature Detection through Prompt Variation (FDPV), which leverages the SAE’s remarkable ability to capture subtle differences between prompts, enabling efficient feature selection for downstream steering. In addition, we propose a novel steering method tailored to ICL—Selective In-Context Steering (SISTER)—grounded in recent insights from ICL research that LLMs utilize label words as key anchors. Our method yields a 3.5% average performance improvement across diverse text classification tasks and exhibits greater robustness to hyperparameter variations compared to standard steering approaches. Our code is available at https://github.com/ihcho2/SAE-ICL.
%U https://aclanthology.org/2025.emnlp-main.1474/
%P 28949-28961
Markdown (Informal)

[Toward Efficient Sparse Autoencoder-Guided Steering for Improved In-Context Learning in Large Language Models](https://aclanthology.org/2025.emnlp-main.1474/) (Cho & Hockenmaier, EMNLP 2025)

ACL

Ikhyun Cho and Julia Hockenmaier. 2025. Toward Efficient Sparse Autoencoder-Guided Steering for Improved In-Context Learning in Large Language Models. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 28949–28961, Suzhou, China. Association for Computational Linguistics.