@inproceedings{garces-arias-etal-2025-geometry,
title = "The Geometry of Creative Variability: How Credal Sets Expose Calibration Gaps in Language Models",
author = "Garces Arias, Esteban and
Rodemann, Julian and
Heumann, Christian",
editor = "Noidea, Noidea",
booktitle = "Proceedings of the 2nd Workshop on Uncertainty-Aware NLP (UncertaiNLP 2025)",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.uncertainlp-main.5/",
pages = "36--50",
ISBN = "979-8-89176-349-4",
abstract = "Understanding uncertainty in large language models remains a fundamental challenge, particularly in creative tasks where multiple valid outputs exist. We present a geometric framework using credal sets{---}convex hulls of probability distributions{---}to quantify and decompose uncertainty in neural text generation, calibrated against human creative variation. Analyzing 500 creative writing prompts from the dataset with 10 unique human continuations each, we evaluate four language models across five decoding strategies, generating 100,000 stories. Our credal set analysis reveals substantial gaps in capturing human creative variation, with the best model-human calibration reaching only 0.434 (Gemma-2B with temperature 0.7). We decompose total uncertainty into \textit{epistemic} and \textit{aleatoric} components, finding that the choice of decoding strategy contributes 39.4{\%} to 72.0{\%} of total epistemic uncertainty. Model scale shows weak correlation with calibration quality and no significant difference exists between base and instruction-tuned models in calibration quality. Our geometric framework provides actionable insights for improving generation systems for human-AI creative alignment. We release our complete experimental framework at \url{https://github.com/EstebanGarces/uncertainHuman}."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="garces-arias-etal-2025-geometry">
<titleInfo>
<title>The Geometry of Creative Variability: How Credal Sets Expose Calibration Gaps in Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Esteban</namePart>
<namePart type="family">Garces Arias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">Rodemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Heumann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Uncertainty-Aware NLP (UncertaiNLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Noidea</namePart>
<namePart type="family">Noidea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-349-4</identifier>
</relatedItem>
<abstract>Understanding uncertainty in large language models remains a fundamental challenge, particularly in creative tasks where multiple valid outputs exist. We present a geometric framework using credal sets—convex hulls of probability distributions—to quantify and decompose uncertainty in neural text generation, calibrated against human creative variation. Analyzing 500 creative writing prompts from the dataset with 10 unique human continuations each, we evaluate four language models across five decoding strategies, generating 100,000 stories. Our credal set analysis reveals substantial gaps in capturing human creative variation, with the best model-human calibration reaching only 0.434 (Gemma-2B with temperature 0.7). We decompose total uncertainty into epistemic and aleatoric components, finding that the choice of decoding strategy contributes 39.4% to 72.0% of total epistemic uncertainty. Model scale shows weak correlation with calibration quality and no significant difference exists between base and instruction-tuned models in calibration quality. Our geometric framework provides actionable insights for improving generation systems for human-AI creative alignment. We release our complete experimental framework at https://github.com/EstebanGarces/uncertainHuman.</abstract>
<identifier type="citekey">garces-arias-etal-2025-geometry</identifier>
<location>
<url>https://aclanthology.org/2025.uncertainlp-main.5/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>36</start>
<end>50</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Geometry of Creative Variability: How Credal Sets Expose Calibration Gaps in Language Models
%A Garces Arias, Esteban
%A Rodemann, Julian
%A Heumann, Christian
%Y Noidea, Noidea
%S Proceedings of the 2nd Workshop on Uncertainty-Aware NLP (UncertaiNLP 2025)
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-349-4
%F garces-arias-etal-2025-geometry
%X Understanding uncertainty in large language models remains a fundamental challenge, particularly in creative tasks where multiple valid outputs exist. We present a geometric framework using credal sets—convex hulls of probability distributions—to quantify and decompose uncertainty in neural text generation, calibrated against human creative variation. Analyzing 500 creative writing prompts from the dataset with 10 unique human continuations each, we evaluate four language models across five decoding strategies, generating 100,000 stories. Our credal set analysis reveals substantial gaps in capturing human creative variation, with the best model-human calibration reaching only 0.434 (Gemma-2B with temperature 0.7). We decompose total uncertainty into epistemic and aleatoric components, finding that the choice of decoding strategy contributes 39.4% to 72.0% of total epistemic uncertainty. Model scale shows weak correlation with calibration quality and no significant difference exists between base and instruction-tuned models in calibration quality. Our geometric framework provides actionable insights for improving generation systems for human-AI creative alignment. We release our complete experimental framework at https://github.com/EstebanGarces/uncertainHuman.
%U https://aclanthology.org/2025.uncertainlp-main.5/
%P 36-50
Markdown (Informal)
[The Geometry of Creative Variability: How Credal Sets Expose Calibration Gaps in Language Models](https://aclanthology.org/2025.uncertainlp-main.5/) (Garces Arias et al., UncertaiNLP 2025)
ACL