@inproceedings{vladislav-etal-2026-unveiling,
title = "Unveiling Intrinsic Dimension of Texts: from Academic Abstract to Creative Story",
author = "Vladislav, Pedashenko and
Kushnareva, Laida and
Nibal, Yana Khassan and
Tulchinskii, Eduard and
Kuznetsov, Kristian and
Zharchinskii, Vladislav and
Maximov, Yury and
Piontkovskaya, Irina",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.370/",
pages = "7916--7944",
ISBN = "979-8-89176-380-7",
abstract = "Intrinsic dimension (ID) is an important tool in modern LLM analysis, informing studies of training dynamics, scaling behavior, and dataset structure, yet its textual determinants remain underexplored. We provide the first comprehensive study grounding ID in interpretable text properties through cross-encoder analysis, linguistic features, and sparse autoencoders (SAEs). In this work, we establish three key findings. First, ID is complementary to entropy-based metrics: after controlling for length, the two are uncorrelated, with ID capturing geometric complexity orthogonal to prediction quality. Second, ID exhibits robust genre stratification: scientific prose shows low ID ($\sim 8$), encyclopedic content medium ID ($\sim 9$), and creative/opinion writing high ID ($\sim 10.5$) across all models tested. This reveals that contemporary LLMs find scientific text ``representationally simple'' while fiction requires additional degrees of freedom. Third, using SAEs, we identify causal features: scientific signals (formal tone, report templates, statistics) reduce ID; humanized signals (personalization, emotion, narrative) increase it. Steering experiments confirm these effects are causal. Thus, for contemporary models, scientific writing appears comparatively ``easy'', whereas fiction, opinion, and affect add representational degrees of freedom. Our multi-faceted analysis provides practical guidance for the proper use of ID and the sound interpretation of ID-based results."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vladislav-etal-2026-unveiling">
<titleInfo>
<title>Unveiling Intrinsic Dimension of Texts: from Academic Abstract to Creative Story</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pedashenko</namePart>
<namePart type="family">Vladislav</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laida</namePart>
<namePart type="family">Kushnareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yana</namePart>
<namePart type="given">Khassan</namePart>
<namePart type="family">Nibal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Tulchinskii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kristian</namePart>
<namePart type="family">Kuznetsov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladislav</namePart>
<namePart type="family">Zharchinskii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yury</namePart>
<namePart type="family">Maximov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irina</namePart>
<namePart type="family">Piontkovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Intrinsic dimension (ID) is an important tool in modern LLM analysis, informing studies of training dynamics, scaling behavior, and dataset structure, yet its textual determinants remain underexplored. We provide the first comprehensive study grounding ID in interpretable text properties through cross-encoder analysis, linguistic features, and sparse autoencoders (SAEs). In this work, we establish three key findings. First, ID is complementary to entropy-based metrics: after controlling for length, the two are uncorrelated, with ID capturing geometric complexity orthogonal to prediction quality. Second, ID exhibits robust genre stratification: scientific prose shows low ID (\sim 8), encyclopedic content medium ID (\sim 9), and creative/opinion writing high ID (\sim 10.5) across all models tested. This reveals that contemporary LLMs find scientific text “representationally simple” while fiction requires additional degrees of freedom. Third, using SAEs, we identify causal features: scientific signals (formal tone, report templates, statistics) reduce ID; humanized signals (personalization, emotion, narrative) increase it. Steering experiments confirm these effects are causal. Thus, for contemporary models, scientific writing appears comparatively “easy”, whereas fiction, opinion, and affect add representational degrees of freedom. Our multi-faceted analysis provides practical guidance for the proper use of ID and the sound interpretation of ID-based results.</abstract>
<identifier type="citekey">vladislav-etal-2026-unveiling</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.370/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>7916</start>
<end>7944</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unveiling Intrinsic Dimension of Texts: from Academic Abstract to Creative Story
%A Vladislav, Pedashenko
%A Kushnareva, Laida
%A Nibal, Yana Khassan
%A Tulchinskii, Eduard
%A Kuznetsov, Kristian
%A Zharchinskii, Vladislav
%A Maximov, Yury
%A Piontkovskaya, Irina
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F vladislav-etal-2026-unveiling
%X Intrinsic dimension (ID) is an important tool in modern LLM analysis, informing studies of training dynamics, scaling behavior, and dataset structure, yet its textual determinants remain underexplored. We provide the first comprehensive study grounding ID in interpretable text properties through cross-encoder analysis, linguistic features, and sparse autoencoders (SAEs). In this work, we establish three key findings. First, ID is complementary to entropy-based metrics: after controlling for length, the two are uncorrelated, with ID capturing geometric complexity orthogonal to prediction quality. Second, ID exhibits robust genre stratification: scientific prose shows low ID (\sim 8), encyclopedic content medium ID (\sim 9), and creative/opinion writing high ID (\sim 10.5) across all models tested. This reveals that contemporary LLMs find scientific text “representationally simple” while fiction requires additional degrees of freedom. Third, using SAEs, we identify causal features: scientific signals (formal tone, report templates, statistics) reduce ID; humanized signals (personalization, emotion, narrative) increase it. Steering experiments confirm these effects are causal. Thus, for contemporary models, scientific writing appears comparatively “easy”, whereas fiction, opinion, and affect add representational degrees of freedom. Our multi-faceted analysis provides practical guidance for the proper use of ID and the sound interpretation of ID-based results.
%U https://aclanthology.org/2026.eacl-long.370/
%P 7916-7944
Markdown (Informal)
[Unveiling Intrinsic Dimension of Texts: from Academic Abstract to Creative Story](https://aclanthology.org/2026.eacl-long.370/) (Vladislav et al., EACL 2026)
ACL
- Pedashenko Vladislav, Laida Kushnareva, Yana Khassan Nibal, Eduard Tulchinskii, Kristian Kuznetsov, Vladislav Zharchinskii, Yury Maximov, and Irina Piontkovskaya. 2026. Unveiling Intrinsic Dimension of Texts: from Academic Abstract to Creative Story. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 7916–7944, Rabat, Morocco. Association for Computational Linguistics.