@inproceedings{janiak-etal-2024-adversarial,
title = "An Adversarial Example for Direct Logit Attribution: Memory Management in {GELU}-4{L}",
author = "Janiak, Jett and
Rager, Can and
Dao, James and
Lau, Yeu-Tong",
editor = "Belinkov, Yonatan and
Kim, Najoung and
Jumelet, Jaap and
Mohebbi, Hosein and
Mueller, Aaron and
Chen, Hanjie",
booktitle = "Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.blackboxnlp-1.15",
pages = "232--237",
abstract = "Prior work suggests that language models manage the limited bandwidth of the residual stream through a {``}memory management{''} mechanism, where certain attention heads and MLP layers clear residual stream directions set by earlier layers. Our study provides concrete evidence for this erasure phenomenon in a 4-layer transformer, identifying heads that consistently remove the output of earlier heads. We further demonstrate that direct logit attribution (DLA), a common technique for interpreting the output of intermediate transformer layers, can show misleading results by not accounting for erasure.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="janiak-etal-2024-adversarial">
<titleInfo>
<title>An Adversarial Example for Direct Logit Attribution: Memory Management in GELU-4L</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jett</namePart>
<namePart type="family">Janiak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Can</namePart>
<namePart type="family">Rager</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Dao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yeu-Tong</namePart>
<namePart type="family">Lau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Najoung</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaap</namePart>
<namePart type="family">Jumelet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hosein</namePart>
<namePart type="family">Mohebbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aaron</namePart>
<namePart type="family">Mueller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanjie</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Prior work suggests that language models manage the limited bandwidth of the residual stream through a “memory management” mechanism, where certain attention heads and MLP layers clear residual stream directions set by earlier layers. Our study provides concrete evidence for this erasure phenomenon in a 4-layer transformer, identifying heads that consistently remove the output of earlier heads. We further demonstrate that direct logit attribution (DLA), a common technique for interpreting the output of intermediate transformer layers, can show misleading results by not accounting for erasure.</abstract>
<identifier type="citekey">janiak-etal-2024-adversarial</identifier>
<location>
<url>https://aclanthology.org/2024.blackboxnlp-1.15</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>232</start>
<end>237</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Adversarial Example for Direct Logit Attribution: Memory Management in GELU-4L
%A Janiak, Jett
%A Rager, Can
%A Dao, James
%A Lau, Yeu-Tong
%Y Belinkov, Yonatan
%Y Kim, Najoung
%Y Jumelet, Jaap
%Y Mohebbi, Hosein
%Y Mueller, Aaron
%Y Chen, Hanjie
%S Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F janiak-etal-2024-adversarial
%X Prior work suggests that language models manage the limited bandwidth of the residual stream through a “memory management” mechanism, where certain attention heads and MLP layers clear residual stream directions set by earlier layers. Our study provides concrete evidence for this erasure phenomenon in a 4-layer transformer, identifying heads that consistently remove the output of earlier heads. We further demonstrate that direct logit attribution (DLA), a common technique for interpreting the output of intermediate transformer layers, can show misleading results by not accounting for erasure.
%U https://aclanthology.org/2024.blackboxnlp-1.15
%P 232-237
Markdown (Informal)
[An Adversarial Example for Direct Logit Attribution: Memory Management in GELU-4L](https://aclanthology.org/2024.blackboxnlp-1.15) (Janiak et al., BlackboxNLP 2024)
ACL