% BibTeX record for the paper below; the same record follows in MODS XML.
@inproceedings{tak-etal-2025-mechanistic,
  title     = {Mechanistic Interpretability of Emotion Inference in Large Language Models},
  author    = {Tak, Ala N. and
               Banayeeanzade, Amin and
               Bolourani, Anahita and
               Kian, Mina and
               Jia, Robin and
               Gratch, Jonathan},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.679/},
  doi       = {10.18653/v1/2025.findings-acl.679},
  pages     = {13090--13120},
  isbn      = {979-8-89176-256-5},
  abstract  = {Large language models (LLMs) show promising capabilities in predicting human emotions from text. However, the mechanisms through which these models process emotional stimuli remain largely unexplored. Our study addresses this gap by investigating how autoregressive LLMs infer emotions, showing that emotion representations are functionally localized to specific regions in the model. Our evaluation includes diverse model families and sizes, and is supported by robustness checks. We then show that the identified representations are psychologically plausible by drawing on cognitive appraisal theory{---}a well-established psychological framework positing that emotions emerge from evaluations (appraisals) of environmental stimuli. By causally intervening on construed appraisal concepts, we steer the generation and show that the outputs align with theoretical and intuitive expectations. This work highlights a novel way to causally intervene and control emotion inference, potentially benefiting safety and alignment in sensitive affective domains.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 rendering of the citation record tak-etal-2025-mechanistic. -->
  <mods ID="tak-etal-2025-mechanistic">
    <titleInfo>
      <title>Mechanistic Interpretability of Emotion Inference in Large Language Models</title>
    </titleInfo>

    <!-- Authors of the paper (one <name> per person, role = author). -->
    <name type="personal">
      <namePart type="given">Ala</namePart>
      <namePart type="given">N</namePart>
      <namePart type="family">Tak</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Amin</namePart>
      <namePart type="family">Banayeeanzade</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anahita</namePart>
      <namePart type="family">Bolourani</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mina</namePart>
      <namePart type="family">Kian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Robin</namePart>
      <namePart type="family">Jia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jonathan</namePart>
      <namePart type="family">Gratch</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>

    <abstract>Large language models (LLMs) show promising capabilities in predicting human emotions from text. However, the mechanisms through which these models process emotional stimuli remain largely unexplored. Our study addresses this gap by investigating how autoregressive LLMs infer emotions, showing that emotion representations are functionally localized to specific regions in the model. Our evaluation includes diverse model families and sizes, and is supported by robustness checks. We then show that the identified representations are psychologically plausible by drawing on cognitive appraisal theory—a well-established psychological framework positing that emotions emerge from evaluations (appraisals) of environmental stimuli. By causally intervening on construed appraisal concepts, we steer the generation and show that the outputs align with theoretical and intuitive expectations. This work highlights a novel way to causally intervene and control emotion inference, potentially benefiting safety and alignment in sensitive affective domains.</abstract>

    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">tak-etal-2025-mechanistic</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-acl.679</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-acl.679/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>13090</start>
        <end>13120</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Mechanistic Interpretability of Emotion Inference in Large Language Models
%A Tak, Ala N.
%A Banayeeanzade, Amin
%A Bolourani, Anahita
%A Kian, Mina
%A Jia, Robin
%A Gratch, Jonathan
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F tak-etal-2025-mechanistic
%X Large language models (LLMs) show promising capabilities in predicting human emotions from text. However, the mechanisms through which these models process emotional stimuli remain largely unexplored. Our study addresses this gap by investigating how autoregressive LLMs infer emotions, showing that emotion representations are functionally localized to specific regions in the model. Our evaluation includes diverse model families and sizes, and is supported by robustness checks. We then show that the identified representations are psychologically plausible by drawing on cognitive appraisal theory—a well-established psychological framework positing that emotions emerge from evaluations (appraisals) of environmental stimuli. By causally intervening on construed appraisal concepts, we steer the generation and show that the outputs align with theoretical and intuitive expectations. This work highlights a novel way to causally intervene and control emotion inference, potentially benefiting safety and alignment in sensitive affective domains.
%R 10.18653/v1/2025.findings-acl.679
%U https://aclanthology.org/2025.findings-acl.679/
%U https://doi.org/10.18653/v1/2025.findings-acl.679
%P 13090-13120
Markdown (Informal)
[Mechanistic Interpretability of Emotion Inference in Large Language Models](https://aclanthology.org/2025.findings-acl.679/) (Tak et al., Findings 2025)
ACL