@inproceedings{syu-lee-2025-hierarchical,
title = "Hierarchical Speculative Decoding with Dynamic Window",
author = "Syu, Shensian and
Lee, Hung-yi",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.462/",
doi = "10.18653/v1/2025.findings-naacl.462",
pages = "8260--8273",
ISBN = "979-8-89176-195-7",
abstract = "Speculative Decoding (SD) utilizes an efficient draft model to generate multiple tokens, which are subsequently verified in parallel by a target model. This approach has shown significant potential for accelerating inference in large language models (LLMs), with performance heavily reliant on the hyperparameter $K${---}the window size. However, previous methods often depend on simple heuristics to select $K$ or dynamically adjust the window size, which may necessitate additional training or careful resource management to avoid competition. To address these challenges, we propose \textbf{H}ierarchical \textbf{S}peculative \textbf{D}ecoding with \textbf{D}ynamic \textbf{W}indow (HSDDW), a straightforward framework that eliminates the need for additional training. Specifically, we introduce a \textit{self-verify} mechanism that enables the draft model to autonomously decide when to stop generating tokens. Additionally, by integrating a hierarchical structure that leverages the capabilities of models of different sizes, we significantly enhance the overall speed of the system. HSDDW demonstrates competitive performance across four datasets, achieving notable speedups of $2.91\times$ on MT-Bench and $2.99\times$ on Alpaca, outperforming existing state-of-the-art methods."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="syu-lee-2025-hierarchical">
<titleInfo>
<title>Hierarchical Speculative Decoding with Dynamic Window</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shensian</namePart>
<namePart type="family">Syu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hung-yi</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Speculative Decoding (SD) utilizes an efficient draft model to generate multiple tokens, which are subsequently verified in parallel by a target model. This approach has shown significant potential for accelerating inference in large language models (LLMs), with performance heavily reliant on the hyperparameter K—the window size. However, previous methods often depend on simple heuristics to select K or dynamically adjust the window size, which may necessitate additional training or careful resource management to avoid competition. To address these challenges, we propose Hierarchical Speculative Decoding with Dynamic Window (HSDDW), a straightforward framework that eliminates the need for additional training. Specifically, we introduce a self-verify mechanism that enables the draft model to autonomously decide when to stop generating tokens. Additionally, by integrating a hierarchical structure that leverages the capabilities of models of different sizes, we significantly enhance the overall speed of the system. HSDDW demonstrates competitive performance across four datasets, achieving notable speedups of 2.91× on MT-Bench and 2.99× on Alpaca, outperforming existing state-of-the-art methods.</abstract>
<identifier type="citekey">syu-lee-2025-hierarchical</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.462</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.462/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>8260</start>
<end>8273</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Hierarchical Speculative Decoding with Dynamic Window
%A Syu, Shensian
%A Lee, Hung-yi
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F syu-lee-2025-hierarchical
%X Speculative Decoding (SD) utilizes an efficient draft model to generate multiple tokens, which are subsequently verified in parallel by a target model. This approach has shown significant potential for accelerating inference in large language models (LLMs), with performance heavily reliant on the hyperparameter K—the window size. However, previous methods often depend on simple heuristics to select K or dynamically adjust the window size, which may necessitate additional training or careful resource management to avoid competition. To address these challenges, we propose Hierarchical Speculative Decoding with Dynamic Window (HSDDW), a straightforward framework that eliminates the need for additional training. Specifically, we introduce a self-verify mechanism that enables the draft model to autonomously decide when to stop generating tokens. Additionally, by integrating a hierarchical structure that leverages the capabilities of models of different sizes, we significantly enhance the overall speed of the system. HSDDW demonstrates competitive performance across four datasets, achieving notable speedups of 2.91× on MT-Bench and 2.99× on Alpaca, outperforming existing state-of-the-art methods.
%R 10.18653/v1/2025.findings-naacl.462
%U https://aclanthology.org/2025.findings-naacl.462/
%U https://doi.org/10.18653/v1/2025.findings-naacl.462
%P 8260-8273
Markdown (Informal)
[Hierarchical Speculative Decoding with Dynamic Window](https://aclanthology.org/2025.findings-naacl.462/) (Syu & Lee, Findings 2025)
ACL