@inproceedings{chang-etal-2025-safr,
title = "{SAFR}: Neuron Redistribution for Interpretability",
author = "Chang, Ruidi and
Deng, Chunyuan and
Chen, Hanjie",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.112/",
doi = "10.18653/v1/2025.findings-naacl.112",
pages = "2117--2126",
ISBN = "979-8-89176-195-7",
abstract = "Superposition refers to encoding representations of multiple features within a single neuron, which is common in deep neural networks. This property allows neurons to combine and represent multiple features, enabling the model to capture intricate information and handle complex tasks. Despite promising performance, the model{'}s interpretability has been diminished. This paper presents a novel approach to enhance model interpretability by regularizing feature superposition. We introduce SAFR, which simply applies regularizations to the loss function to promote monosemantic representations for important tokens while encouraging polysemanticity for correlated token pairs, where important tokens and correlated token pairs are identified via VMASK and attention weights respectively. We evaluate SAFR with a transformer model on two classification tasks. Experiments demonstrate the effectiveness of SAFR in improving model interpretability without compromising prediction performance. Besides, SAFR provides explanations by visualizing the neuron allocation within the intermediate layers."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chang-etal-2025-safr">
    <titleInfo>
      <title>SAFR: Neuron Redistribution for Interpretability</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ruidi</namePart>
      <namePart type="family">Chang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chunyuan</namePart>
      <namePart type="family">Deng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hanjie</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-195-7</identifier>
    </relatedItem>
    <abstract>Superposition refers to encoding representations of multiple features within a single neuron, which is common in deep neural networks. This property allows neurons to combine and represent multiple features, enabling the model to capture intricate information and handle complex tasks. Despite promising performance, the model’s interpretability has been diminished. This paper presents a novel approach to enhance model interpretability by regularizing feature superposition. We introduce SAFR, which simply applies regularizations to the loss function to promote monosemantic representations for important tokens while encouraging polysemanticity for correlated token pairs, where important tokens and correlated token pairs are identified via VMASK and attention weights respectively. We evaluate SAFR with a transformer model on two classification tasks. Experiments demonstrate the effectiveness of SAFR in improving model interpretability without compromising prediction performance. Besides, SAFR provides explanations by visualizing the neuron allocation within the intermediate layers.</abstract>
    <identifier type="citekey">chang-etal-2025-safr</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-naacl.112</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-naacl.112/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>2117</start>
        <end>2126</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T SAFR: Neuron Redistribution for Interpretability
%A Chang, Ruidi
%A Deng, Chunyuan
%A Chen, Hanjie
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F chang-etal-2025-safr
%X Superposition refers to encoding representations of multiple features within a single neuron, which is common in deep neural networks. This property allows neurons to combine and represent multiple features, enabling the model to capture intricate information and handle complex tasks. Despite promising performance, the model’s interpretability has been diminished. This paper presents a novel approach to enhance model interpretability by regularizing feature superposition. We introduce SAFR, which simply applies regularizations to the loss function to promote monosemantic representations for important tokens while encouraging polysemanticity for correlated token pairs, where important tokens and correlated token pairs are identified via VMASK and attention weights respectively. We evaluate SAFR with a transformer model on two classification tasks. Experiments demonstrate the effectiveness of SAFR in improving model interpretability without compromising prediction performance. Besides, SAFR provides explanations by visualizing the neuron allocation within the intermediate layers.
%R 10.18653/v1/2025.findings-naacl.112
%U https://aclanthology.org/2025.findings-naacl.112/
%U https://doi.org/10.18653/v1/2025.findings-naacl.112
%P 2117-2126
Markdown (Informal)
[SAFR: Neuron Redistribution for Interpretability](https://aclanthology.org/2025.findings-naacl.112/) (Chang et al., Findings 2025)

ACL
Ruidi Chang, Chunyuan Deng, and Hanjie Chen. 2025. SAFR: Neuron Redistribution for Interpretability. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 2117–2126, Albuquerque, New Mexico. Association for Computational Linguistics.
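
The abstract describes SAFR as adding regularization terms to the training loss: one pushing important tokens toward monosemantic (concentrated) representations, another encouraging correlated token pairs to share neurons. As a rough illustration of that idea only, here is a minimal sketch; it is not the authors' released implementation, and the function name, the entropy/cosine formulations, and the weighting coefficients are all assumptions made for illustration.

```python
import torch
import torch.nn.functional as F

def safr_style_loss(task_loss, hidden, important_idx, pairs,
                    lam_mono=0.1, lam_poly=0.1):
    # hidden: (seq_len, d) intermediate-layer activations for one example.
    # important_idx: indices of important tokens (the paper identifies these
    # via VMASK; here they are simply given).
    # pairs: (i, j) index pairs of correlated tokens (the paper identifies
    # these via attention weights; here they are simply given).

    # Monosemanticity term: concentrate each important token's representation
    # on few neurons by penalizing the entropy of its |activation| profile.
    probs = F.softmax(hidden[important_idx].abs(), dim=-1)
    entropy = -(probs * probs.clamp_min(1e-9).log()).sum(dim=-1).mean()

    # Polysemanticity term: reward neuron overlap for correlated token pairs
    # via cosine similarity of their activation vectors.
    i, j = map(list, zip(*pairs))
    overlap = F.cosine_similarity(hidden[i], hidden[j], dim=-1).mean()

    # Lower entropy -> more monosemantic; higher overlap -> more shared neurons.
    return task_loss + lam_mono * entropy - lam_poly * overlap

# Toy usage with random activations:
h = torch.randn(12, 64, requires_grad=True)
loss = safr_style_loss(torch.tensor(0.0), h, [0, 3], [(1, 2), (4, 5)])
loss.backward()
```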