@inproceedings{pankratov-alistarh-2026-speculative,
title = "Speculative Decoding Speed-of-Light: Optimal Lower Bounds via Branching Random Walks",
author = "Pankratov, Sergey and
Alistarh, Dan",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'\i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.301/",
pages = "6404--6418",
ISBN = "979-8-89176-380-7",
abstract = "Speculative generation has emerged as a promising technique to accelerate inference in large language models (LLMs) by leveraging parallelism to verify multiple draft tokens simultaneously. However, the fundamental limits on the achievable speedup remain poorly understood. In this work, we establish the first ``tight'' lower bounds on the runtime of any deterministic speculative generation algorithm. This is achieved by drawing a parallel between the token generation process and branching random walks, which allows us to analyze the optimal draft tree selection problem. We prove, under basic assumptions, that the expected number of tokens successfully predicted per speculative iteration is bounded as $\mathbb{E}[X] \leq (\mu + \mu_{(2)})\log(B)/\mu^2 + O(1)$, where $B$ is the verifier{'}s batch size, $\mu$ is the expected entropy of the verifier{'}s output distribution, and $\mu_{(2)}$ is this entropy{'}s second moment. This result provides new insights into the limits of parallel token generation, and could guide the design of future speculative decoding systems. Empirical evaluations on Llama models validate our theoretical predictions, confirming the tightness of our bounds in practical settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pankratov-alistarh-2026-speculative">
<titleInfo>
<title>Speculative Decoding Speed-of-Light: Optimal Lower Bounds via Branching Random Walks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sergey</namePart>
<namePart type="family">Pankratov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dan</namePart>
<namePart type="family">Alistarh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Speculative generation has emerged as a promising technique to accelerate inference in large language models (LLMs) by leveraging parallelism to verify multiple draft tokens simultaneously. However, the fundamental limits on the achievable speedup remain poorly understood. In this work, we establish the first “tight” lower bounds on the runtime of any deterministic speculative generation algorithm. This is achieved by drawing a parallel between the token generation process and branching random walks, which allows us to analyze the optimal draft tree selection problem. We prove, under basic assumptions, that the expected number of tokens successfully predicted per speculative iteration is bounded as E[X] ≤ (μ + μ_(2))log(B)/μ² + O(1), where B is the verifier’s batch size, μ is the expected entropy of the verifier’s output distribution, and μ_(2) is this entropy’s second moment. This result provides new insights into the limits of parallel token generation, and could guide the design of future speculative decoding systems. Empirical evaluations on Llama models validate our theoretical predictions, confirming the tightness of our bounds in practical settings.</abstract>
<identifier type="citekey">pankratov-alistarh-2026-speculative</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.301/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>6404</start>
<end>6418</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Speculative Decoding Speed-of-Light: Optimal Lower Bounds via Branching Random Walks
%A Pankratov, Sergey
%A Alistarh, Dan
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F pankratov-alistarh-2026-speculative
%X Speculative generation has emerged as a promising technique to accelerate inference in large language models (LLMs) by leveraging parallelism to verify multiple draft tokens simultaneously. However, the fundamental limits on the achievable speedup remain poorly understood. In this work, we establish the first “tight” lower bounds on the runtime of any deterministic speculative generation algorithm. This is achieved by drawing a parallel between the token generation process and branching random walks, which allows us to analyze the optimal draft tree selection problem. We prove, under basic assumptions, that the expected number of tokens successfully predicted per speculative iteration is bounded as E[X] ≤ (μ + μ_(2))log(B)/μ² + O(1), where B is the verifier’s batch size, μ is the expected entropy of the verifier’s output distribution, and μ_(2) is this entropy’s second moment. This result provides new insights into the limits of parallel token generation, and could guide the design of future speculative decoding systems. Empirical evaluations on Llama models validate our theoretical predictions, confirming the tightness of our bounds in practical settings.
%U https://aclanthology.org/2026.eacl-long.301/
%P 6404-6418
Markdown (Informal)
[Speculative Decoding Speed-of-Light: Optimal Lower Bounds via Branching Random Walks](https://aclanthology.org/2026.eacl-long.301/) (Pankratov & Alistarh, EACL 2026)
ACL