@inproceedings{kumar-2026-pressure,
title = "Pressure-Testing Deception Probes in {LLM}s: Scaling, Robustness, and the Geometry of Deceptive Representations",
author = "Kumar, Sachin",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.43/",
pages = "472--489",
ISBN = "979-8-89176-423-1",
abstract = "Linear probes trained on internal activations of Large Language Models (LLMs) are increasingly proposed as evaluation metrics for deceptive generation, automated monitors that score whether a model{'}s output was produced deceptively, without requiring ground-truth labels or human annotation. Yet these metrics report AUROC scores exceeding 0.96 on clean benchmarks while demonstrating profound fragility under distributional shift. This paper presents a systematic pressure-test of such probe-based evaluation metrics across the Gemma 3 model family (1B{--}27B parameters), diagnosing why they fail rather than merely documenting that they fail. We investigate four competing hypotheses about how deception is encoded: as (1) a single linear direction, (2) a multi-dimensional subspace, (3) a convex conic hull, or (4) a proxy for computational entropy. Our experimental design includes cross-domain transfer matrices, multi-dimensional probe analysis with permutation null baselines, entropy-residualization tests, and systematic distractor evaluations across 8 stylistic shifts. Across all four model scales, we find that: (a) probe-based metrics achieve near-perfect AUROC ($\ge$0.998) on clean data but collapse under stylistic shifts when trained without stylistic augmentation, whereas style-augmented probes recover near-perfect detection (mean AUROC{~}0.979{--}0.983) even on unseen styles; (b) the single-direction hypothesis is decisively rejected (k=1 captures only 0.61{--}0.80 AUROC of the signal), with cross-domain transfer failure confirmed as geometric rather than layer-mismatch-driven; (c) the entropy-proxy hypothesis is rejected (maximum $|\rho|=0.454$, maximum $\Delta$AUROC after residualization=0.004); and (d) deception does not form a statistically significant linear subspace even within individual domains (per-domain $k^*{=}0$), yet multi-dimensional probes (k$\ge$5) consistently recover the signal through distributed sub-threshold features. 
These findings demonstrate that probe fragility under standard training reflects distributional narrowness rather than a fundamental architectural limitation: style-augmented probes recover near-perfect detection (mean AUROC 0.979{--}0.983 on unseen styles) at both the 4B and 27B scales, establishing that the inverse scaling pattern observed under standard training is a training-distribution artifact rather than a genuine scale-dependent phenomenon."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kumar-2026-pressure">
<titleInfo>
<title>Pressure-Testing Deception Probes in LLMs: Scaling, Robustness, and the Geometry of Deceptive Representations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sachin</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>Linear probes trained on internal activations of Large Language Models (LLMs) are increasingly proposed as evaluation metrics for deceptive generation, automated monitors that score whether a model’s output was produced deceptively, without requiring ground-truth labels or human annotation. Yet these metrics report AUROC scores exceeding 0.96 on clean benchmarks while demonstrating profound fragility under distributional shift. This paper presents a systematic pressure-test of such probe-based evaluation metrics across the Gemma 3 model family (1B–27B parameters), diagnosing why they fail rather than merely documenting that they fail. We investigate four competing hypotheses about how deception is encoded: as (1) a single linear direction, (2) a multi-dimensional subspace, (3) a convex conic hull, or (4) a proxy for computational entropy. Our experimental design includes cross-domain transfer matrices, multi-dimensional probe analysis with permutation null baselines, entropy-residualization tests, and systematic distractor evaluations across 8 stylistic shifts. Across all four model scales, we find that: (a) probe-based metrics achieve near-perfect AUROC (≥0.998) on clean data but collapse under stylistic shifts when trained without stylistic augmentation, whereas style-augmented probes recover near-perfect detection (mean AUROC 0.979–0.983) even on unseen styles; (b) the single-direction hypothesis is decisively rejected (k=1 captures only 0.61–0.80 AUROC of the signal), with cross-domain transfer failure confirmed as geometric rather than layer-mismatch-driven; (c) the entropy-proxy hypothesis is rejected (maximum |ρ|=0.454, maximum ΔAUROC after residualization=0.004); and (d) deception does not form a statistically significant linear subspace even within individual domains (per-domain k*=0), yet multi-dimensional probes (k≥5) consistently recover the signal through distributed sub-threshold features. 
These findings demonstrate that probe fragility under standard training reflects distributional narrowness rather than a fundamental architectural limitation: style-augmented probes recover near-perfect detection (mean AUROC 0.979–0.983 on unseen styles) at both the 4B and 27B scales, establishing that the inverse scaling pattern observed under standard training is a training-distribution artifact rather than a genuine scale-dependent phenomenon.</abstract>
<identifier type="citekey">kumar-2026-pressure</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.43/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>472</start>
<end>489</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Pressure-Testing Deception Probes in LLMs: Scaling, Robustness, and the Geometry of Deceptive Representations
%A Kumar, Sachin
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F kumar-2026-pressure
%X Linear probes trained on internal activations of Large Language Models (LLMs) are increasingly proposed as evaluation metrics for deceptive generation, automated monitors that score whether a model’s output was produced deceptively, without requiring ground-truth labels or human annotation. Yet these metrics report AUROC scores exceeding 0.96 on clean benchmarks while demonstrating profound fragility under distributional shift. This paper presents a systematic pressure-test of such probe-based evaluation metrics across the Gemma 3 model family (1B–27B parameters), diagnosing why they fail rather than merely documenting that they fail. We investigate four competing hypotheses about how deception is encoded: as (1) a single linear direction, (2) a multi-dimensional subspace, (3) a convex conic hull, or (4) a proxy for computational entropy. Our experimental design includes cross-domain transfer matrices, multi-dimensional probe analysis with permutation null baselines, entropy-residualization tests, and systematic distractor evaluations across 8 stylistic shifts. Across all four model scales, we find that: (a) probe-based metrics achieve near-perfect AUROC (≥0.998) on clean data but collapse under stylistic shifts when trained without stylistic augmentation, whereas style-augmented probes recover near-perfect detection (mean AUROC 0.979–0.983) even on unseen styles; (b) the single-direction hypothesis is decisively rejected (k=1 captures only 0.61–0.80 AUROC of the signal), with cross-domain transfer failure confirmed as geometric rather than layer-mismatch-driven; (c) the entropy-proxy hypothesis is rejected (maximum |ρ|=0.454, maximum ΔAUROC after residualization=0.004); and (d) deception does not form a statistically significant linear subspace even within individual domains (per-domain k*=0), yet multi-dimensional probes (k≥5) consistently recover the signal through distributed sub-threshold features. 
These findings demonstrate that probe fragility under standard training reflects distributional narrowness rather than a fundamental architectural limitation: style-augmented probes recover near-perfect detection (mean AUROC 0.979–0.983 on unseen styles) at both the 4B and 27B scales, establishing that the inverse scaling pattern observed under standard training is a training-distribution artifact rather than a genuine scale-dependent phenomenon.
%U https://aclanthology.org/2026.gem-main.43/
%P 472-489
Markdown (Informal)
[Pressure-Testing Deception Probes in LLMs: Scaling, Robustness, and the Geometry of Deceptive Representations](https://aclanthology.org/2026.gem-main.43/) (Kumar, GEM 2026)
ACL