@article{waldis-etal-2026-aligned,
title = "Aligned Probing: Relating Toxic Behavior and Model Internals",
author = "Waldis, Andreas and
Gautam, Vagrant and
Lauscher, Anne and
Klakow, Dietrich and
Gurevych, Iryna",
journal = "Transactions of the Association for Computational Linguistics",
volume = "14",
year = "2026",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2026.tacl-1.14/",
doi = "10.1162/tacl.a.613",
pages = "271--291",
abstract = "Warning: This paper contains offensive text. We introduce aligned probing, a novel interpretability framework that aligns the behavior of language models (LMs), based on their outputs, and their internal representations (internals). Using this framework, we examine over 20 OLMo, Llama, and Mistral models, bridging behavioral and internal perspectives for toxicity for the first time. Our results show that LMs strongly encode information about the toxicity level of inputs and subsequent outputs, particularly in lower layers. Focusing on how unique LMs differ offers both correlative and causal evidence that they generate less toxic output when strongly encoding information about the input toxicity. We also highlight the heterogeneity of toxicity, as model behavior and internals vary across unique attributes such as Threat. Finally, four case studies analyzing detoxification, multi-prompt evaluations, model quantization, and pre-training dynamics underline the practical impact of aligned probing with further concrete insights. Our findings contribute to a more holistic understanding of LMs, both within and beyond the context of toxicity. alignedprobing.github.io"
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="waldis-etal-2026-aligned">
<titleInfo>
<title>Aligned Probing: Relating Toxic Behavior and Model Internals</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Waldis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vagrant</namePart>
<namePart type="family">Gautam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anne</namePart>
<namePart type="family">Lauscher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dietrich</namePart>
<namePart type="family">Klakow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iryna</namePart>
<namePart type="family">Gurevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Warning: This paper contains offensive text. We introduce aligned probing, a novel interpretability framework that aligns the behavior of language models (LMs), based on their outputs, and their internal representations (internals). Using this framework, we examine over 20 OLMo, Llama, and Mistral models, bridging behavioral and internal perspectives for toxicity for the first time. Our results show that LMs strongly encode information about the toxicity level of inputs and subsequent outputs, particularly in lower layers. Focusing on how unique LMs differ offers both correlative and causal evidence that they generate less toxic output when strongly encoding information about the input toxicity. We also highlight the heterogeneity of toxicity, as model behavior and internals vary across unique attributes such as Threat. Finally, four case studies analyzing detoxification, multi-prompt evaluations, model quantization, and pre-training dynamics underline the practical impact of aligned probing with further concrete insights. Our findings contribute to a more holistic understanding of LMs, both within and beyond the context of toxicity. alignedprobing.github.io</abstract>
<identifier type="citekey">waldis-etal-2026-aligned</identifier>
<identifier type="doi">10.1162/tacl.a.613</identifier>
<location>
<url>https://aclanthology.org/2026.tacl-1.14/</url>
</location>
<part>
<date>2026</date>
<detail type="volume"><number>14</number></detail>
<extent unit="page">
<start>271</start>
<end>291</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Aligned Probing: Relating Toxic Behavior and Model Internals
%A Waldis, Andreas
%A Gautam, Vagrant
%A Lauscher, Anne
%A Klakow, Dietrich
%A Gurevych, Iryna
%J Transactions of the Association for Computational Linguistics
%D 2026
%V 14
%I MIT Press
%C Cambridge, MA
%F waldis-etal-2026-aligned
%X Warning: This paper contains offensive text. We introduce aligned probing, a novel interpretability framework that aligns the behavior of language models (LMs), based on their outputs, and their internal representations (internals). Using this framework, we examine over 20 OLMo, Llama, and Mistral models, bridging behavioral and internal perspectives for toxicity for the first time. Our results show that LMs strongly encode information about the toxicity level of inputs and subsequent outputs, particularly in lower layers. Focusing on how unique LMs differ offers both correlative and causal evidence that they generate less toxic output when strongly encoding information about the input toxicity. We also highlight the heterogeneity of toxicity, as model behavior and internals vary across unique attributes such as Threat. Finally, four case studies analyzing detoxification, multi-prompt evaluations, model quantization, and pre-training dynamics underline the practical impact of aligned probing with further concrete insights. Our findings contribute to a more holistic understanding of LMs, both within and beyond the context of toxicity. alignedprobing.github.io
%R 10.1162/tacl.a.613
%U https://aclanthology.org/2026.tacl-1.14/
%U https://doi.org/10.1162/tacl.a.613
%P 271-291
Markdown (Informal)
[Aligned Probing: Relating Toxic Behavior and Model Internals](https://aclanthology.org/2026.tacl-1.14/) (Waldis et al., TACL 2026)
ACL