@inproceedings{katz-etal-2024-backward,
  title     = {Backward Lens: Projecting Language Model Gradients into the Vocabulary Space},
  author    = {Katz, Shahar and Belinkov, Yonatan and Geva, Mor and Wolf, Lior},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.142},
  doi       = {10.18653/v1/2024.emnlp-main.142},
  pages     = {2390--2422},
  abstract  = {Understanding how Transformer-based Language Models (LMs) learn and recall information is a key goal of the deep learning community. Recent interpretability methods project weights and hidden states obtained from the forward pass to the models{'} vocabularies, helping to uncover how information flows within LMs. In this work, we extend this methodology to LMs{'} backward pass and gradients. We first prove that a gradient matrix can be cast as a low-rank linear combination of its forward and backward passes{'} inputs. We then develop methods to project these gradients into vocabulary items and explore the mechanics of how new information is stored in the LMs{'} neurons.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="katz-etal-2024-backward">
<titleInfo>
<title>Backward Lens: Projecting Language Model Gradients into the Vocabulary Space</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shahar</namePart>
<namePart type="family">Katz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mor</namePart>
<namePart type="family">Geva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lior</namePart>
<namePart type="family">Wolf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Understanding how Transformer-based Language Models (LMs) learn and recall information is a key goal of the deep learning community. Recent interpretability methods project weights and hidden states obtained from the forward pass to the models’ vocabularies, helping to uncover how information flows within LMs. In this work, we extend this methodology to LMs’ backward pass and gradients. We first prove that a gradient matrix can be cast as a low-rank linear combination of its forward and backward passes’ inputs. We then develop methods to project these gradients into vocabulary items and explore the mechanics of how new information is stored in the LMs’ neurons.</abstract>
<identifier type="citekey">katz-etal-2024-backward</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.142</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.142</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>2390</start>
<end>2422</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Backward Lens: Projecting Language Model Gradients into the Vocabulary Space
%A Katz, Shahar
%A Belinkov, Yonatan
%A Geva, Mor
%A Wolf, Lior
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F katz-etal-2024-backward
%X Understanding how Transformer-based Language Models (LMs) learn and recall information is a key goal of the deep learning community. Recent interpretability methods project weights and hidden states obtained from the forward pass to the models’ vocabularies, helping to uncover how information flows within LMs. In this work, we extend this methodology to LMs’ backward pass and gradients. We first prove that a gradient matrix can be cast as a low-rank linear combination of its forward and backward passes’ inputs. We then develop methods to project these gradients into vocabulary items and explore the mechanics of how new information is stored in the LMs’ neurons.
%R 10.18653/v1/2024.emnlp-main.142
%U https://aclanthology.org/2024.emnlp-main.142
%U https://doi.org/10.18653/v1/2024.emnlp-main.142
%P 2390-2422
Markdown (Informal)
[Backward Lens: Projecting Language Model Gradients into the Vocabulary Space](https://aclanthology.org/2024.emnlp-main.142) (Katz et al., EMNLP 2024)
ACL