@inproceedings{hollinsworth-etal-2024-language,
title = "Language Models Linearly Represent Sentiment",
author = "Hollinsworth, Oskar John and
Tigges, Curt and
Geiger, Atticus and
Nanda, Neel",
editor = "Belinkov, Yonatan and
Kim, Najoung and
Jumelet, Jaap and
Mohebbi, Hosein and
Mueller, Aaron and
Chen, Hanjie",
booktitle = "Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.blackboxnlp-1.5",
pages = "58--87",
abstract = "Sentiment is a pervasive feature in natural language text, yet it is an open question how sentiment is represented within Large Language Models (LLMs). In this study, we reveal that across a range of models, sentiment is represented linearly: a single direction in activation space mostly captures the feature across a range of tasks with one extreme for positive and the other for negative. In a causal analysis, we isolate this direction using interventions and show it is causal in both toy tasks and real world datasets such as Stanford Sentiment Treebank. We analyze the mechanisms that involve this direction and discover a phenomenon which we term the summarization motif: sentiment is not just represented on valenced words, but is also summarized at intermediate positions without inherent sentiment, such as punctuation and names. We show that in SST classification, ablating the sentiment direction across all tokens results in a drop in accuracy from 100{\%} to 62{\%} (vs. 50{\%} random baseline), while ablating the summarized sentiment direction at comma positions alone produces close to half this result (reducing accuracy to 82{\%}).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hollinsworth-etal-2024-language">
<titleInfo>
<title>Language Models Linearly Represent Sentiment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oskar</namePart>
<namePart type="given">John</namePart>
<namePart type="family">Hollinsworth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Curt</namePart>
<namePart type="family">Tigges</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atticus</namePart>
<namePart type="family">Geiger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Neel</namePart>
<namePart type="family">Nanda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Najoung</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaap</namePart>
<namePart type="family">Jumelet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hosein</namePart>
<namePart type="family">Mohebbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aaron</namePart>
<namePart type="family">Mueller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanjie</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Sentiment is a pervasive feature in natural language text, yet it is an open question how sentiment is represented within Large Language Models (LLMs). In this study, we reveal that across a range of models, sentiment is represented linearly: a single direction in activation space mostly captures the feature across a range of tasks with one extreme for positive and the other for negative. In a causal analysis, we isolate this direction using interventions and show it is causal in both toy tasks and real world datasets such as Stanford Sentiment Treebank. We analyze the mechanisms that involve this direction and discover a phenomenon which we term the summarization motif: sentiment is not just represented on valenced words, but is also summarized at intermediate positions without inherent sentiment, such as punctuation and names. We show that in SST classification, ablating the sentiment direction across all tokens results in a drop in accuracy from 100% to 62% (vs. 50% random baseline), while ablating the summarized sentiment direction at comma positions alone produces close to half this result (reducing accuracy to 82%).</abstract>
<identifier type="citekey">hollinsworth-etal-2024-language</identifier>
<location>
<url>https://aclanthology.org/2024.blackboxnlp-1.5</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>58</start>
<end>87</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Models Linearly Represent Sentiment
%A Hollinsworth, Oskar John
%A Tigges, Curt
%A Geiger, Atticus
%A Nanda, Neel
%Y Belinkov, Yonatan
%Y Kim, Najoung
%Y Jumelet, Jaap
%Y Mohebbi, Hosein
%Y Mueller, Aaron
%Y Chen, Hanjie
%S Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F hollinsworth-etal-2024-language
%X Sentiment is a pervasive feature in natural language text, yet it is an open question how sentiment is represented within Large Language Models (LLMs). In this study, we reveal that across a range of models, sentiment is represented linearly: a single direction in activation space mostly captures the feature across a range of tasks with one extreme for positive and the other for negative. In a causal analysis, we isolate this direction using interventions and show it is causal in both toy tasks and real world datasets such as Stanford Sentiment Treebank. We analyze the mechanisms that involve this direction and discover a phenomenon which we term the summarization motif: sentiment is not just represented on valenced words, but is also summarized at intermediate positions without inherent sentiment, such as punctuation and names. We show that in SST classification, ablating the sentiment direction across all tokens results in a drop in accuracy from 100% to 62% (vs. 50% random baseline), while ablating the summarized sentiment direction at comma positions alone produces close to half this result (reducing accuracy to 82%).
%U https://aclanthology.org/2024.blackboxnlp-1.5
%P 58-87
Markdown (Informal)
[Language Models Linearly Represent Sentiment](https://aclanthology.org/2024.blackboxnlp-1.5) (Hollinsworth et al., BlackboxNLP 2024)
ACL
- Oskar John Hollinsworth, Curt Tigges, Atticus Geiger, and Neel Nanda. 2024. Language Models Linearly Represent Sentiment. In Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP, pages 58–87, Miami, Florida, US. Association for Computational Linguistics.