@inproceedings{ghilardi-etal-2024-accelerating,
title = "Accelerating Sparse Autoencoder Training via Layer-Wise Transfer Learning in Large Language Models",
author = "Ghilardi, Davide and
Belotti, Federico and
Molinari, Marco and
Lim, Jaehyuk",
editor = "Belinkov, Yonatan and
Kim, Najoung and
Jumelet, Jaap and
Mohebbi, Hosein and
Mueller, Aaron and
Chen, Hanjie",
booktitle = "Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.blackboxnlp-1.32",
pages = "530--550",
abstract = "Sparse AutoEncoders (SAEs) have gained popularity as a tool for enhancing the interpretability of Large Language Models (LLMs). However, training SAEs can be computationally intensive, especially as model complexity grows. In this study, the potential of transfer learning to accelerate SAEs training is explored by capitalizing on the shared representations found across adjacent layers of LLMs. Our experimental results demonstrate that fine-tuning SAEs using pre-trained models from nearby layers not only maintains but often improves the quality of learned representations, while significantly accelerating convergence. These findings indicate that the strategic reuse of pretrained SAEs is a promising approach, particularly in settings where computational resources are constrained.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ghilardi-etal-2024-accelerating">
<titleInfo>
<title>Accelerating Sparse Autoencoder Training via Layer-Wise Transfer Learning in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Davide</namePart>
<namePart type="family">Ghilardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Federico</namePart>
<namePart type="family">Belotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Molinari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaehyuk</namePart>
<namePart type="family">Lim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Najoung</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaap</namePart>
<namePart type="family">Jumelet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hosein</namePart>
<namePart type="family">Mohebbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aaron</namePart>
<namePart type="family">Mueller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanjie</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Sparse AutoEncoders (SAEs) have gained popularity as a tool for enhancing the interpretability of Large Language Models (LLMs). However, training SAEs can be computationally intensive, especially as model complexity grows. In this study, the potential of transfer learning to accelerate SAEs training is explored by capitalizing on the shared representations found across adjacent layers of LLMs. Our experimental results demonstrate that fine-tuning SAEs using pre-trained models from nearby layers not only maintains but often improves the quality of learned representations, while significantly accelerating convergence. These findings indicate that the strategic reuse of pretrained SAEs is a promising approach, particularly in settings where computational resources are constrained.</abstract>
<identifier type="citekey">ghilardi-etal-2024-accelerating</identifier>
<location>
<url>https://aclanthology.org/2024.blackboxnlp-1.32</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>530</start>
<end>550</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Accelerating Sparse Autoencoder Training via Layer-Wise Transfer Learning in Large Language Models
%A Ghilardi, Davide
%A Belotti, Federico
%A Molinari, Marco
%A Lim, Jaehyuk
%Y Belinkov, Yonatan
%Y Kim, Najoung
%Y Jumelet, Jaap
%Y Mohebbi, Hosein
%Y Mueller, Aaron
%Y Chen, Hanjie
%S Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F ghilardi-etal-2024-accelerating
%X Sparse AutoEncoders (SAEs) have gained popularity as a tool for enhancing the interpretability of Large Language Models (LLMs). However, training SAEs can be computationally intensive, especially as model complexity grows. In this study, the potential of transfer learning to accelerate SAEs training is explored by capitalizing on the shared representations found across adjacent layers of LLMs. Our experimental results demonstrate that fine-tuning SAEs using pre-trained models from nearby layers not only maintains but often improves the quality of learned representations, while significantly accelerating convergence. These findings indicate that the strategic reuse of pretrained SAEs is a promising approach, particularly in settings where computational resources are constrained.
%U https://aclanthology.org/2024.blackboxnlp-1.32
%P 530-550
Markdown (Informal)
[Accelerating Sparse Autoencoder Training via Layer-Wise Transfer Learning in Large Language Models](https://aclanthology.org/2024.blackboxnlp-1.32) (Ghilardi et al., BlackboxNLP 2024)
ACL