@inproceedings{du-2026-geometric,
title = "Geometric Deviation as an Unsupervised Pre-Generation Reliability Signal: Probing {LLM} Representations for Answerability",
author = "Du, Yucheng",
editor = "Chang, Kai-Wei and
Mehrabi, Ninareh and
Krishna, Satyapriya and
Das, Anubrata and
Dhamala, Jwala and
Cao, Yang Trista and
Kumarage, Tharindu and
Ramakrishna, Anil and
Christodoulopoulos, Christos and
Wan, Yixin and
Galystan, Aram and
Kumar, Anoop and
Gupta, Rahul",
booktitle = "Proceedings of the 6th Workshop on Trustworthy {NLP} ({T}rust{NLP} 2026)",
month = jul,
year = "2026",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.trustnlp-main.22/",
pages = "353--363",
ISBN = "979-8-89176-418-7",
abstract = "A reliable language model should be able to signal, prior to generation, when a query falls outside its knowledge. We investigate whether representation geometry can provide such a pre-generation signal by measuring the deviation of hidden states from an answerable reference set, requiring no labeled failure data and no access to model outputs.Across three instruction-tuned models (Llama 3.1-8B, Qwen 2.5-7B, and Mistral-7B-Instruct) and three prompt forms (Math, Fact, Code), we find that geometry primarily encodes task form. Within mathematical prompts, unanswerable inputs consistently deviate from the answerable centroid, yielding strong separation (ROC-AUC 0.78-0.84). This single-pass pre-generation signal outperforms a simple refusal baseline and compares favorably to self-consistency. It also captures cases where models do not explicitly refuse.In contrast, no reliable geometric signal emerges for factual prompts, indicating that the effect is form-conditional rather than universal. Code prompts show large effect sizes with higher variance, suggesting partial generalization beyond mathematical form.A layer-wise analysis reveals that the signal arises in early layers and gradually attenuates toward the output. These results suggest that answerability-related geometry is established before the final stages of generation. Together, these findings indicate that geometric deviation can serve as a lightweight pre-generation signal that is reliable in structured domains with formal answerability constraints, with clear boundaries on where it generalizes."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="du-2026-geometric">
<titleInfo>
<title>Geometric Deviation as an Unsupervised Pre-Generation Reliability Signal: Probing LLM Representations for Answerability</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yucheng</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Satyapriya</namePart>
<namePart type="family">Krishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anubrata</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="given">Trista</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Kumarage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anil</namePart>
<namePart type="family">Ramakrishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yixin</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aram</namePart>
<namePart type="family">Galystan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-418-7</identifier>
</relatedItem>
<abstract>A reliable language model should be able to signal, prior to generation, when a query falls outside its knowledge. We investigate whether representation geometry can provide such a pre-generation signal by measuring the deviation of hidden states from an answerable reference set, requiring no labeled failure data and no access to model outputs.Across three instruction-tuned models (Llama 3.1-8B, Qwen 2.5-7B, and Mistral-7B-Instruct) and three prompt forms (Math, Fact, Code), we find that geometry primarily encodes task form. Within mathematical prompts, unanswerable inputs consistently deviate from the answerable centroid, yielding strong separation (ROC-AUC 0.78-0.84). This single-pass pre-generation signal outperforms a simple refusal baseline and compares favorably to self-consistency. It also captures cases where models do not explicitly refuse.In contrast, no reliable geometric signal emerges for factual prompts, indicating that the effect is form-conditional rather than universal. Code prompts show large effect sizes with higher variance, suggesting partial generalization beyond mathematical form.A layer-wise analysis reveals that the signal arises in early layers and gradually attenuates toward the output. These results suggest that answerability-related geometry is established before the final stages of generation. Together, these findings indicate that geometric deviation can serve as a lightweight pre-generation signal that is reliable in structured domains with formal answerability constraints, with clear boundaries on where it generalizes.</abstract>
<identifier type="citekey">du-2026-geometric</identifier>
<location>
<url>https://aclanthology.org/2026.trustnlp-main.22/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>353</start>
<end>363</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Geometric Deviation as an Unsupervised Pre-Generation Reliability Signal: Probing LLM Representations for Answerability
%A Du, Yucheng
%Y Chang, Kai-Wei
%Y Mehrabi, Ninareh
%Y Krishna, Satyapriya
%Y Das, Anubrata
%Y Dhamala, Jwala
%Y Cao, Yang Trista
%Y Kumarage, Tharindu
%Y Ramakrishna, Anil
%Y Christodoulopoulos, Christos
%Y Wan, Yixin
%Y Galystan, Aram
%Y Kumar, Anoop
%Y Gupta, Rahul
%S Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-418-7
%F du-2026-geometric
%X A reliable language model should be able to signal, prior to generation, when a query falls outside its knowledge. We investigate whether representation geometry can provide such a pre-generation signal by measuring the deviation of hidden states from an answerable reference set, requiring no labeled failure data and no access to model outputs.Across three instruction-tuned models (Llama 3.1-8B, Qwen 2.5-7B, and Mistral-7B-Instruct) and three prompt forms (Math, Fact, Code), we find that geometry primarily encodes task form. Within mathematical prompts, unanswerable inputs consistently deviate from the answerable centroid, yielding strong separation (ROC-AUC 0.78-0.84). This single-pass pre-generation signal outperforms a simple refusal baseline and compares favorably to self-consistency. It also captures cases where models do not explicitly refuse.In contrast, no reliable geometric signal emerges for factual prompts, indicating that the effect is form-conditional rather than universal. Code prompts show large effect sizes with higher variance, suggesting partial generalization beyond mathematical form.A layer-wise analysis reveals that the signal arises in early layers and gradually attenuates toward the output. These results suggest that answerability-related geometry is established before the final stages of generation. Together, these findings indicate that geometric deviation can serve as a lightweight pre-generation signal that is reliable in structured domains with formal answerability constraints, with clear boundaries on where it generalizes.
%U https://aclanthology.org/2026.trustnlp-main.22/
%P 353-363
Markdown (Informal)
[Geometric Deviation as an Unsupervised Pre-Generation Reliability Signal: Probing LLM Representations for Answerability](https://aclanthology.org/2026.trustnlp-main.22/) (Du, TrustNLP 2026)
ACL