@inproceedings{lin-etal-2024-slim,
title = "{SL}i{M}: Speculative Decoding with Hypothesis Reduction",
author = "Lin, Chi-Heng and
Tuli, Shikhar and
Smith, James and
Hsu, Yen-Chang and
Shen, Yilin and
Jin, Hongxia",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.63",
doi = "10.18653/v1/2024.findings-naacl.63",
pages = "1005--1017",
abstract = "Speculative decoding has emerged as a prominent alternative to autoregressive decoding for expediting inference in large language models (LLMs). However, prevailing assumptions often focus solely on latency reduction, neglecting the computational expenses. In this paper, we present \textbf{S}peculate \textbf{L}ess, val\textbf{i}date \textbf{M}ore (SLiM), a speculative decoding enhancement to reduce the speculation set while validating more effective tokens. SLiM is designed to mitigate LLMs{'} computation costs associated with the token verification by introducing hypothesis reduction based on a fast posterior estimation. It consistently surpasses counterparts lacking cost reduction across a spectrum from CPU to GPU. Our evaluation with diverse conversational datasets shows that SLiM can achieve a substantial 70{\%} reduction in FLOPs while generating more effective predictions on top of prior arts.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="lin-etal-2024-slim">
    <titleInfo>
      <title>SLiM: Speculative Decoding with Hypothesis Reduction</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Chi-Heng</namePart>
      <namePart type="family">Lin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shikhar</namePart>
      <namePart type="family">Tuli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">James</namePart>
      <namePart type="family">Smith</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yen-Chang</namePart>
      <namePart type="family">Hsu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yilin</namePart>
      <namePart type="family">Shen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hongxia</namePart>
      <namePart type="family">Jin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kevin</namePart>
        <namePart type="family">Duh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Helena</namePart>
        <namePart type="family">Gomez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Bethard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Mexico City, Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Speculative decoding has emerged as a prominent alternative to autoregressive decoding for expediting inference in large language models (LLMs). However, prevailing assumptions often focus solely on latency reduction, neglecting the computational expenses. In this paper, we present Speculate Less, validate More (SLiM), a speculative decoding enhancement to reduce the speculation set while validating more effective tokens. SLiM is designed to mitigate LLMs’ computation costs associated with the token verification by introducing hypothesis reduction based on a fast posterior estimation. It consistently surpasses counterparts lacking cost reduction across a spectrum from CPU to GPU. Our evaluation with diverse conversational datasets shows that SLiM can achieve a substantial 70% reduction in FLOPs while generating more effective predictions on top of prior arts.</abstract>
    <identifier type="citekey">lin-etal-2024-slim</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-naacl.63</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-naacl.63</url>
    </location>
    <part>
      <date>2024-06</date>
      <extent unit="page">
        <start>1005</start>
        <end>1017</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T SLiM: Speculative Decoding with Hypothesis Reduction
%A Lin, Chi-Heng
%A Tuli, Shikhar
%A Smith, James
%A Hsu, Yen-Chang
%A Shen, Yilin
%A Jin, Hongxia
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F lin-etal-2024-slim
%X Speculative decoding has emerged as a prominent alternative to autoregressive decoding for expediting inference in large language models (LLMs). However, prevailing assumptions often focus solely on latency reduction, neglecting the computational expenses. In this paper, we present Speculate Less, validate More (SLiM), a speculative decoding enhancement to reduce the speculation set while validating more effective tokens. SLiM is designed to mitigate LLMs’ computation costs associated with the token verification by introducing hypothesis reduction based on a fast posterior estimation. It consistently surpasses counterparts lacking cost reduction across a spectrum from CPU to GPU. Our evaluation with diverse conversational datasets shows that SLiM can achieve a substantial 70% reduction in FLOPs while generating more effective predictions on top of prior arts.
%R 10.18653/v1/2024.findings-naacl.63
%U https://aclanthology.org/2024.findings-naacl.63
%U https://doi.org/10.18653/v1/2024.findings-naacl.63
%P 1005-1017
Markdown (Informal)
[SLiM: Speculative Decoding with Hypothesis Reduction](https://aclanthology.org/2024.findings-naacl.63) (Lin et al., Findings 2024)
ACL
Chi-Heng Lin, Shikhar Tuli, James Smith, Yen-Chang Hsu, Yilin Shen, and Hongxia Jin. 2024. SLiM: Speculative Decoding with Hypothesis Reduction. In Findings of the Association for Computational Linguistics: NAACL 2024, pages 1005–1017, Mexico City, Mexico. Association for Computational Linguistics.
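
For readers skimming the abstract above, here is a minimal, illustrative sketch of the general speculate-then-prune-then-verify loop it describes. Everything in it is assumed for illustration: the stand-in models (`_logits` with "draft" and "target" salts), the use of the draft distribution as the fast posterior estimate, and all names and parameters are hypothetical; this is not the paper's actual SLiM implementation.

```python
# Illustrative sketch (NOT the paper's SLiM code): speculative decoding
# where a cheap posterior estimate prunes the hypothesis set before the
# expensive target-model verification, cutting verification FLOPs.
import numpy as np

VOCAB = 64  # toy vocabulary size


def _logits(prefix, salt):
    # Deterministic pseudo-logits keyed on the prefix; a stand-in for a
    # real language model's forward pass.
    g = np.random.default_rng(abs(hash((tuple(prefix), salt))) % (2**32))
    return g.normal(size=VOCAB)


def softmax(x):
    z = np.exp(x - x.max())
    return z / z.sum()


def speculate_and_verify(prefix, k=8, keep=3):
    # 1. Speculate: a small draft model proposes k candidate next tokens.
    p_draft = softmax(_logits(prefix, "draft"))
    candidates = np.argsort(p_draft)[-k:]

    # 2. Reduce: a fast posterior estimate (here simply the draft
    #    distribution itself) keeps only the top `keep` hypotheses, so
    #    the target model has a smaller set to verify.
    survivors = sorted(candidates, key=lambda t: p_draft[t])[-keep:]

    # 3. Verify: one expensive target-model call scores the survivors;
    #    accept the token the target model itself prefers.
    p_target = softmax(_logits(prefix, "target"))
    return int(max(survivors, key=lambda t: p_target[t]))


if __name__ == "__main__":
    print("accepted token:", speculate_and_verify([1, 2, 3]))
```

In the abstract's framing, step 2 is where the claimed roughly 70% FLOP reduction comes from: fewer hypotheses reach the costly verification pass, while the surviving ones are more likely to be accepted.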