@inproceedings{qian-etal-2024-bass,
title = "{BASS}: Batched Attention-optimized Speculative Sampling",
author = "Qian, Haifeng and
Gonugondla, Sujan Kumar and
Ha, Sungsoo and
Shang, Mingyue and
Gouda, Sanjay Krishna and
Nallapati, Ramesh and
Sengupta, Sudipta and
Ma, Xiaofei and
Deoras, Anoop",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.489",
doi = "10.18653/v1/2024.findings-acl.489",
pages = "8214--8224",
abstract = "Speculative decoding has emerged as a powerful method to improve latency and throughput in hosting large language models. However, most existing implementations focus on generating a single sequence. Real-world generative AI applications often require multiple responses and how to perform speculative decoding in a batched setting while preserving its latency benefits poses non-trivial challenges. This paper describes a system of batched speculative decoding that sets a new state of the art in multi-sequence generation latency and that demonstrates superior GPU utilization as well as quality of generations within a time budget. For example, for a 7.8B-size model on a single A100 GPU and with a batch size of 8, each sequence is generated at an average speed of 5.8ms per token, the overall throughput being 1.1K tokens per second. These results represent state-of-the-art latency and a 2.15$\times$ speed-up over optimized regular decoding. Within a time budget that regular decoding does not finish, our system is able to generate sequences with HumanEval Pass@First of 43{\%} and Pass@All of 61{\%}, far exceeding what{'}s feasible with single-sequence speculative decoding. Our peak GPU utilization during decoding reaches as high as 15.8{\%}, more than 3$\times$ the highest of that of regular decoding and around 10$\times$ of single-sequence speculative decoding.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="qian-etal-2024-bass">
<titleInfo>
<title>BASS: Batched Attention-optimized Speculative Sampling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haifeng</namePart>
<namePart type="family">Qian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujan</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Gonugondla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sungsoo</namePart>
<namePart type="family">Ha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingyue</namePart>
<namePart type="family">Shang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sanjay</namePart>
<namePart type="given">Krishna</namePart>
<namePart type="family">Gouda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramesh</namePart>
<namePart type="family">Nallapati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudipta</namePart>
<namePart type="family">Sengupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaofei</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Deoras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Speculative decoding has emerged as a powerful method to improve latency and throughput in hosting large language models. However, most existing implementations focus on generating a single sequence. Real-world generative AI applications often require multiple responses, and how to perform speculative decoding in a batched setting while preserving its latency benefits poses non-trivial challenges. This paper describes a system of batched speculative decoding that sets a new state of the art in multi-sequence generation latency and that demonstrates superior GPU utilization as well as quality of generations within a time budget. For example, for a 7.8B-size model on a single A100 GPU and with a batch size of 8, each sequence is generated at an average speed of 5.8 ms per token, for an overall throughput of 1.1K tokens per second. These results represent state-of-the-art latency and a 2.15× speed-up over optimized regular decoding. Within a time budget in which regular decoding does not finish, our system is able to generate sequences with HumanEval Pass@First of 43% and Pass@All of 61%, far exceeding what’s feasible with single-sequence speculative decoding. Our peak GPU utilization during decoding reaches as high as 15.8%, more than 3× the highest utilization of regular decoding and around 10× that of single-sequence speculative decoding.</abstract>
<identifier type="citekey">qian-etal-2024-bass</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.489</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.489</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>8214</start>
<end>8224</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BASS: Batched Attention-optimized Speculative Sampling
%A Qian, Haifeng
%A Gonugondla, Sujan Kumar
%A Ha, Sungsoo
%A Shang, Mingyue
%A Gouda, Sanjay Krishna
%A Nallapati, Ramesh
%A Sengupta, Sudipta
%A Ma, Xiaofei
%A Deoras, Anoop
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F qian-etal-2024-bass
%X Speculative decoding has emerged as a powerful method to improve latency and throughput in hosting large language models. However, most existing implementations focus on generating a single sequence. Real-world generative AI applications often require multiple responses, and how to perform speculative decoding in a batched setting while preserving its latency benefits poses non-trivial challenges. This paper describes a system of batched speculative decoding that sets a new state of the art in multi-sequence generation latency and that demonstrates superior GPU utilization as well as quality of generations within a time budget. For example, for a 7.8B-size model on a single A100 GPU and with a batch size of 8, each sequence is generated at an average speed of 5.8 ms per token, for an overall throughput of 1.1K tokens per second. These results represent state-of-the-art latency and a 2.15× speed-up over optimized regular decoding. Within a time budget in which regular decoding does not finish, our system is able to generate sequences with HumanEval Pass@First of 43% and Pass@All of 61%, far exceeding what’s feasible with single-sequence speculative decoding. Our peak GPU utilization during decoding reaches as high as 15.8%, more than 3× the highest utilization of regular decoding and around 10× that of single-sequence speculative decoding.
%R 10.18653/v1/2024.findings-acl.489
%U https://aclanthology.org/2024.findings-acl.489
%U https://doi.org/10.18653/v1/2024.findings-acl.489
%P 8214-8224
Markdown (Informal)
[BASS: Batched Attention-optimized Speculative Sampling](https://aclanthology.org/2024.findings-acl.489) (Qian et al., Findings 2024)
ACL
- Haifeng Qian, Sujan Kumar Gonugondla, Sungsoo Ha, Mingyue Shang, Sanjay Krishna Gouda, Ramesh Nallapati, Sudipta Sengupta, Xiaofei Ma, and Anoop Deoras. 2024. BASS: Batched Attention-optimized Speculative Sampling. In Findings of the Association for Computational Linguistics: ACL 2024, pages 8214–8224, Bangkok, Thailand. Association for Computational Linguistics.