% BibTeX record for the BlackboxNLP 2023 workshop paper (pages 16--30); citation key: nanda-etal-2023-emergent.
@inproceedings{nanda-etal-2023-emergent,
    title = "Emergent Linear Representations in World Models of Self-Supervised Sequence Models",
    author = "Nanda, Neel and
      Lee, Andrew and
      Wattenberg, Martin",
    editor = "Belinkov, Yonatan and
      Hao, Sophie and
      Jumelet, Jaap and
      Kim, Najoung and
      McCarthy, Arya and
      Mohebbi, Hosein",
    booktitle = "Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
    month = dec,
    year = "2023",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.blackboxnlp-1.2",
    doi = "10.18653/v1/2023.blackboxnlp-1.2",
    pages = "16--30",
    abstract = "How do sequence models represent their decision-making process? Prior work suggests that Othello-playing neural network learned nonlinear models of the board state (Li et al., 2023a). In this work, we provide evidence of a closely related linear representation of the board. In particular, we show that probing for {``}my colour{''} vs. {``}opponent{'}s colour{''} may be a simple yet powerful way to interpret the model{'}s internal state. This precise understanding of the internal representations allows us to control the model{'}s behaviour with simple vector arithmetic. Linear representations enable significant interpretability progress, which we demonstrate with further exploration of how the world model is computed.",
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record (citekey nanda-etal-2023-emergent):
     a workshop paper with three authors, hosted in a proceedings volume with six editors. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="nanda-etal-2023-emergent">
    <titleInfo>
      <title>Emergent Linear Representations in World Models of Self-Supervised Sequence Models</title>
    </titleInfo>
    <!-- Paper authors, in citation order. -->
    <name type="personal">
      <namePart type="given">Neel</namePart>
      <namePart type="family">Nanda</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Andrew</namePart>
      <namePart type="family">Lee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Martin</namePart>
      <namePart type="family">Wattenberg</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yonatan</namePart>
        <namePart type="family">Belinkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sophie</namePart>
        <namePart type="family">Hao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jaap</namePart>
        <namePart type="family">Jumelet</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Najoung</namePart>
        <namePart type="family">Kim</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Arya</namePart>
        <namePart type="family">McCarthy</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hosein</namePart>
        <namePart type="family">Mohebbi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <!-- Abstract is kept on one line: whitespace inside a text element is data. -->
    <abstract>How do sequence models represent their decision-making process? Prior work suggests that Othello-playing neural network learned nonlinear models of the board state (Li et al., 2023a). In this work, we provide evidence of a closely related linear representation of the board. In particular, we show that probing for “my colour” vs. “opponent’s colour” may be a simple yet powerful way to interpret the model’s internal state. This precise understanding of the internal representations allows us to control the model’s behaviour with simple vector arithmetic. Linear representations enable significant interpretability progress, which we demonstrate with further exploration of how the world model is computed.</abstract>
    <identifier type="citekey">nanda-etal-2023-emergent</identifier>
    <identifier type="doi">10.18653/v1/2023.blackboxnlp-1.2</identifier>
    <location>
      <url>https://aclanthology.org/2023.blackboxnlp-1.2</url>
    </location>
    <!-- Position of the paper within the host volume: issue date and page range. -->
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>16</start>
        <end>30</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Emergent Linear Representations in World Models of Self-Supervised Sequence Models
%A Nanda, Neel
%A Lee, Andrew
%A Wattenberg, Martin
%Y Belinkov, Yonatan
%Y Hao, Sophie
%Y Jumelet, Jaap
%Y Kim, Najoung
%Y McCarthy, Arya
%Y Mohebbi, Hosein
%S Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F nanda-etal-2023-emergent
%X How do sequence models represent their decision-making process? Prior work suggests that Othello-playing neural network learned nonlinear models of the board state (Li et al., 2023a). In this work, we provide evidence of a closely related linear representation of the board. In particular, we show that probing for “my colour” vs. “opponent’s colour” may be a simple yet powerful way to interpret the model’s internal state. This precise understanding of the internal representations allows us to control the model’s behaviour with simple vector arithmetic. Linear representations enable significant interpretability progress, which we demonstrate with further exploration of how the world model is computed.
%R 10.18653/v1/2023.blackboxnlp-1.2
%U https://aclanthology.org/2023.blackboxnlp-1.2
%U https://doi.org/10.18653/v1/2023.blackboxnlp-1.2
%P 16-30
Markdown (Informal)
[Emergent Linear Representations in World Models of Self-Supervised Sequence Models](https://aclanthology.org/2023.blackboxnlp-1.2) (Nanda et al., BlackboxNLP-WS 2023)
ACL