@inproceedings{piotrowski-etal-2025-will,
title = "When Will the Tokens End? Graph-Based Forecasting for {LLM}s Output Length",
author = "Piotrowski, Grzegorz and
Bystro{\'n}ski, Mateusz and
Ho{\l}ysz, Miko{\l}aj and
Binkowski, Jakub and
Chodak, Grzegorz and
Kajdanowicz, Tomasz Jan",
editor = "Zhao, Jin and
Wang, Mingyang and
Liu, Zhu",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-srw.61/",
doi = "10.18653/v1/2025.acl-srw.61",
pages = "843--848",
ISBN = "979-8-89176-254-1",
abstract = "Large Language Models (LLMs) are typically trained to predict the next token in a sequence. However, their internal representations often encode signals that go beyond immediate next-token prediction. In this work, we investigate whether these hidden states also carry information about the remaining length of the generated output{---}an implicit form of foresight (CITATION). We formulate this as a regression problem where, at generation step $t$, the target is the number of remaining tokens $y_t = T - t$, with $T$ as the total output length.We propose two approaches: (1) an aggregation-based model that combines hidden states from multiple transformer layers $\ell \in \{8, \dots, 15\}$ using element-wise operations such as mean or sum, and (2) a \textit{Layerwise Graph Regressor} that treats layerwise hidden states as nodes in a fully connected graph and applies a Graph Neural Network (GNN) to predict $y_t$. Both models operate on frozen LLM embeddings without requiring end-to-end fine-tuning.Accurately estimating remaining output length has both theoretical and practical implications. From an interpretability standpoint, it suggests that LLMs internally track their generation progress. From a systems perspective, it enables optimizations such as output-length-aware scheduling (CITATION). Our graph-based model achieves state-of-the-art performance on the Alpaca dataset using LLaMA-3-8B-Instruct, reducing normalized mean absolute error (NMAE) by over 50{\%} in short-output scenarios."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="piotrowski-etal-2025-will">
<titleInfo>
<title>When Will the Tokens End? Graph-Based Forecasting for LLMs Output Length</title>
</titleInfo>
<name type="personal">
<namePart type="given">Grzegorz</namePart>
<namePart type="family">Piotrowski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mateusz</namePart>
<namePart type="family">Bystroński</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikołaj</namePart>
<namePart type="family">Hołysz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jakub</namePart>
<namePart type="family">Binkowski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Grzegorz</namePart>
<namePart type="family">Chodak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tomasz</namePart>
<namePart type="given">Jan</namePart>
<namePart type="family">Kajdanowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jin</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingyang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-254-1</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) are typically trained to predict the next token in a sequence. However, their internal representations often encode signals that go beyond immediate next-token prediction. In this work, we investigate whether these hidden states also carry information about the remaining length of the generated output—an implicit form of foresight (CITATION). We formulate this as a regression problem where, at generation step t, the target is the number of remaining tokens y_t = T - t, with T as the total output length. We propose two approaches: (1) an aggregation-based model that combines hidden states from multiple transformer layers ℓ ∈ {8, …, 15} using element-wise operations such as mean or sum, and (2) a Layerwise Graph Regressor that treats layerwise hidden states as nodes in a fully connected graph and applies a Graph Neural Network (GNN) to predict y_t. Both models operate on frozen LLM embeddings without requiring end-to-end fine-tuning. Accurately estimating remaining output length has both theoretical and practical implications. From an interpretability standpoint, it suggests that LLMs internally track their generation progress. From a systems perspective, it enables optimizations such as output-length-aware scheduling (CITATION). Our graph-based model achieves state-of-the-art performance on the Alpaca dataset using LLaMA-3-8B-Instruct, reducing normalized mean absolute error (NMAE) by over 50% in short-output scenarios.</abstract>
<identifier type="citekey">piotrowski-etal-2025-will</identifier>
<identifier type="doi">10.18653/v1/2025.acl-srw.61</identifier>
<location>
<url>https://aclanthology.org/2025.acl-srw.61/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>843</start>
<end>848</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Will the Tokens End? Graph-Based Forecasting for LLMs Output Length
%A Piotrowski, Grzegorz
%A Bystroński, Mateusz
%A Hołysz, Mikołaj
%A Binkowski, Jakub
%A Chodak, Grzegorz
%A Kajdanowicz, Tomasz Jan
%Y Zhao, Jin
%Y Wang, Mingyang
%Y Liu, Zhu
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-254-1
%F piotrowski-etal-2025-will
%X Large Language Models (LLMs) are typically trained to predict the next token in a sequence. However, their internal representations often encode signals that go beyond immediate next-token prediction. In this work, we investigate whether these hidden states also carry information about the remaining length of the generated output—an implicit form of foresight (CITATION). We formulate this as a regression problem where, at generation step t, the target is the number of remaining tokens y_t = T - t, with T as the total output length. We propose two approaches: (1) an aggregation-based model that combines hidden states from multiple transformer layers ℓ ∈ {8, …, 15} using element-wise operations such as mean or sum, and (2) a Layerwise Graph Regressor that treats layerwise hidden states as nodes in a fully connected graph and applies a Graph Neural Network (GNN) to predict y_t. Both models operate on frozen LLM embeddings without requiring end-to-end fine-tuning. Accurately estimating remaining output length has both theoretical and practical implications. From an interpretability standpoint, it suggests that LLMs internally track their generation progress. From a systems perspective, it enables optimizations such as output-length-aware scheduling (CITATION). Our graph-based model achieves state-of-the-art performance on the Alpaca dataset using LLaMA-3-8B-Instruct, reducing normalized mean absolute error (NMAE) by over 50% in short-output scenarios.
%R 10.18653/v1/2025.acl-srw.61
%U https://aclanthology.org/2025.acl-srw.61/
%U https://doi.org/10.18653/v1/2025.acl-srw.61
%P 843-848

Markdown (Informal)
[When Will the Tokens End? Graph-Based Forecasting for LLMs Output Length](https://aclanthology.org/2025.acl-srw.61/) (Piotrowski et al., ACL 2025)
ACL
Grzegorz Piotrowski, Mateusz Bystroński, Mikołaj Hołysz, Jakub Binkowski, Grzegorz Chodak, and Tomasz Jan Kajdanowicz. 2025. When Will the Tokens End? Graph-Based Forecasting for LLMs Output Length. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop), pages 843–848, Vienna, Austria. Association for Computational Linguistics.
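
As a reading aid for the abstract above, here is a minimal sketch of the aggregation-based baseline it describes: mean-pooling frozen hidden states from layers ℓ ∈ {8, …, 15} and regressing the remaining-token count y_t = T - t. The layer range and target come from the abstract; the regression-head architecture, tensor shapes, and all identifiers below are illustrative assumptions, not the authors' implementation.

```python
# Illustrative sketch only -- not the paper's code. It mirrors the abstract's
# aggregation-based baseline: mean-pool frozen hidden states from layers 8-15
# and regress the number of remaining tokens y_t = T - t.
import torch
import torch.nn as nn

class LayerAggregationRegressor(nn.Module):
    """Mean-pools hidden states over a fixed layer range and predicts y_t."""

    def __init__(self, hidden_dim: int, layers=tuple(range(8, 16))):
        super().__init__()
        self.layers = list(layers)  # \ell in {8, ..., 15}
        # Small MLP head; the head actually used in the paper is not specified here.
        self.head = nn.Sequential(nn.Linear(hidden_dim, 256), nn.ReLU(), nn.Linear(256, 1))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: (num_layers, batch, hidden_dim), e.g. stacked from a
        # frozen LLM run with output_hidden_states=True at generation step t.
        pooled = hidden_states[self.layers].mean(dim=0)  # element-wise mean across layers
        return self.head(pooled).squeeze(-1)  # predicted remaining tokens y_t

# Usage with random tensors standing in for real hidden states:
states = torch.randn(33, 4, 4096)  # e.g. 33 layer outputs of an 8B model, batch of 4
model = LayerAggregationRegressor(hidden_dim=4096)
print(model(states).shape)  # torch.Size([4])
```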