% BibTeX record for a long paper in the ACL 2026 main proceedings (Volume 1).
% Text outside an @entry is ignored by BibTeX, so these lines act as comments.
% `address` holds the location given in the source record; `url` and `doi`
% both identify the same paper (2026.acl-long.1905).
@inproceedings{corona-mendozza-sogaard-2026-llm,
  title     = {{LLM} Beliefs Are in Their Heads},
  author    = {Corona Mendozza, Alessandro and
               S{\o}gaard, Anders},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association for Computational Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.1905/},
  doi       = {10.18653/v1/2026.acl-long.1905},
  pages     = {41033--41067},
  isbn      = {979-8-89176-390-6},
  abstract  = {We investigate belief-like representations in decoder-only autoregressive LLMs using linear controlled probes on residual stream activations and single attention heads. Following Herrmann and Levinstein{'}s (2025) criteria (Accuracy, Use, Coherence, and Uniformity) we find that large models exhibit strong truth sensitivity (Accuracy), and steering activations along probe directions reliably changes downstream behavior (Use). Coherence, measured via calibrated probes and cross-dataset probing, is moderate across models, while training on diverse data yields domain-consistent truth directions (Uniformity). The results are particularly encouraging at the head level and align with some standard philosophical accounts of belief, e.g., minimal functionalism, supporting the view that LLMs can maintain propositional attitudes under such theoretical frameworks.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="corona-mendozza-sogaard-2026-llm">
    <titleInfo>
      <title>LLM Beliefs Are in Their Heads</title>
    </titleInfo>
    <!-- Authors of the paper, in order. -->
    <name type="personal">
      <namePart type="given">Alessandro</namePart>
      <namePart type="family">Corona Mendozza</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anders</namePart>
      <namePart type="family">Søgaard</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume that contains the paper. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <!-- Editors of the host volume, in order. -->
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>
    <abstract>We investigate belief-like representations in decoder-only autoregressive LLMs using linear controlled probes on residual stream activations and single attention heads. Following Herrmann and Levinstein’s (2025) criteria (Accuracy, Use, Coherence, and Uniformity) we find that large models exhibit strong truth sensitivity (Accuracy), and steering activations along probe directions reliably changes downstream behavior (Use). Coherence, measured via calibrated probes and cross-dataset probing, is moderate across models, while training on diverse data yields domain-consistent truth directions (Uniformity). The results are particularly encouraging at the head level and align with some standard philosophical accounts of belief, e.g., minimal functionalism, supporting the view that LLMs can maintain propositional attitudes under such theoretical frameworks.</abstract>
    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">corona-mendozza-sogaard-2026-llm</identifier>
    <identifier type="doi">10.18653/v1/2026.acl-long.1905</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.1905/</url>
    </location>
    <!-- Date and page extent of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>41033</start>
        <end>41067</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T LLM Beliefs Are in Their Heads
%A Corona Mendozza, Alessandro
%A Søgaard, Anders
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F corona-mendozza-sogaard-2026-llm
%X We investigate belief-like representations in decoder-only autoregressive LLMs using linear controlled probes on residual stream activations and single attention heads. Following Herrmann and Levinstein’s (2025) criteria (Accuracy, Use, Coherence, and Uniformity) we find that large models exhibit strong truth sensitivity (Accuracy), and steering activations along probe directions reliably changes downstream behavior (Use). Coherence, measured via calibrated probes and cross-dataset probing, is moderate across models, while training on diverse data yields domain-consistent truth directions (Uniformity). The results are particularly encouraging at the head level and align with some standard philosophical accounts of belief, e.g., minimal functionalism, supporting the view that LLMs can maintain propositional attitudes under such theoretical frameworks.
%R 10.18653/v1/2026.acl-long.1905
%U https://aclanthology.org/2026.acl-long.1905/
%U https://doi.org/10.18653/v1/2026.acl-long.1905
%P 41033-41067
Markdown (Informal)
[LLM Beliefs Are in Their Heads](https://aclanthology.org/2026.acl-long.1905/) (Corona Mendozza & Søgaard, ACL 2026)
ACL
- Alessandro Corona Mendozza and Anders Søgaard. 2026. LLM Beliefs Are in Their Heads. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 41033–41067, San Diego, California, United States. Association for Computational Linguistics.