@inproceedings{sun-etal-2024-spechub,
title = "{S}pec{H}ub: Provable Acceleration to Multi-Draft Speculative Decoding",
author = "Sun, Ryan and
Zhou, Tianyi and
Chen, Xun and
Sun, Lichao",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1148",
pages = "20620--20641",
abstract = "Large Language Models (LLMs) have become essential in advancing natural language processing (NLP) tasks, but their sequential token generation limits inference speed. Multi-Draft Speculative Decoding (MDSD) offers a promising solution by using a smaller draft model to generate multiple token sequences, which the target LLM verifies in parallel.However, current heuristic approaches, such as Recursive Rejection Sampling (RRS), suffer from low acceptance rates in subsequent drafts, limiting the advantages of using multiple drafts. Meanwhile, Optimal Transport with Membership Cost (OTM) can theoretically improve acceptance rates, but its computational cost is too high for real-time use.We present SpecHub, a novel, efficient sampling-verification method for MDSD that improves acceptance rates with only linear computational overhead. By simplifying the OTM problem into a compact Linear Programming model, SpecHub significantly reduces computational complexity. It further accelerates sampling by leveraging a sparse joint distribution, focusing computation on high-probability token sequences.{\%}It integrates seamlessly into existing MDSD frameworks.In extensive experiments, Spechub consistently generates 0.05-0.27 and 0.02-0.16 more tokens per step than RRS and RRS without replacement. We attach our code at https://github.com/MasterGodzilla/Speculative{\_}decoding{\_}OT.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sun-etal-2024-spechub">
<titleInfo>
<title>SpecHub: Provable Acceleration to Multi-Draft Speculative Decoding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianyi</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xun</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lichao</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) have become essential in advancing natural language processing (NLP) tasks, but their sequential token generation limits inference speed. Multi-Draft Speculative Decoding (MDSD) offers a promising solution by using a smaller draft model to generate multiple token sequences, which the target LLM verifies in parallel.However, current heuristic approaches, such as Recursive Rejection Sampling (RRS), suffer from low acceptance rates in subsequent drafts, limiting the advantages of using multiple drafts. Meanwhile, Optimal Transport with Membership Cost (OTM) can theoretically improve acceptance rates, but its computational cost is too high for real-time use.We present SpecHub, a novel, efficient sampling-verification method for MDSD that improves acceptance rates with only linear computational overhead. By simplifying the OTM problem into a compact Linear Programming model, SpecHub significantly reduces computational complexity. It further accelerates sampling by leveraging a sparse joint distribution, focusing computation on high-probability token sequences.%It integrates seamlessly into existing MDSD frameworks.In extensive experiments, Spechub consistently generates 0.05-0.27 and 0.02-0.16 more tokens per step than RRS and RRS without replacement. We attach our code at https://github.com/MasterGodzilla/Speculative_decoding_OT.</abstract>
<identifier type="citekey">sun-etal-2024-spechub</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1148</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>20620</start>
<end>20641</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SpecHub: Provable Acceleration to Multi-Draft Speculative Decoding
%A Sun, Ryan
%A Zhou, Tianyi
%A Chen, Xun
%A Sun, Lichao
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F sun-etal-2024-spechub
%X Large Language Models (LLMs) have become essential in advancing natural language processing (NLP) tasks, but their sequential token generation limits inference speed. Multi-Draft Speculative Decoding (MDSD) offers a promising solution by using a smaller draft model to generate multiple token sequences, which the target LLM verifies in parallel.However, current heuristic approaches, such as Recursive Rejection Sampling (RRS), suffer from low acceptance rates in subsequent drafts, limiting the advantages of using multiple drafts. Meanwhile, Optimal Transport with Membership Cost (OTM) can theoretically improve acceptance rates, but its computational cost is too high for real-time use.We present SpecHub, a novel, efficient sampling-verification method for MDSD that improves acceptance rates with only linear computational overhead. By simplifying the OTM problem into a compact Linear Programming model, SpecHub significantly reduces computational complexity. It further accelerates sampling by leveraging a sparse joint distribution, focusing computation on high-probability token sequences.%It integrates seamlessly into existing MDSD frameworks.In extensive experiments, Spechub consistently generates 0.05-0.27 and 0.02-0.16 more tokens per step than RRS and RRS without replacement. We attach our code at https://github.com/MasterGodzilla/Speculative_decoding_OT.
%U https://aclanthology.org/2024.emnlp-main.1148
%P 20620-20641
Markdown (Informal)
[SpecHub: Provable Acceleration to Multi-Draft Speculative Decoding](https://aclanthology.org/2024.emnlp-main.1148) (Sun et al., EMNLP 2024)
ACL