% FastKV paper, Findings of ACL 2026 (record exported from the ACL Anthology).
@inproceedings{jo-etal-2026-fastkv,
    title     = {{FastKV}: Decoupling of Context Reduction and {KV} Cache Compression for Prefill-Decoding Acceleration},
    author    = {Jo, Dongwon and
                 Song, Jiwon and
                 Kim, Yulhwa and
                 Kim, Jae-Joon},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.findings-acl.1610/},
    pages     = {32167--32186},
    isbn      = {979-8-89176-395-1},
    abstract  = {While large language models (LLMs) excel at handling long-context sequences, they require substantial prefill computation and key-value (KV) cache, which can heavily burden computational efficiency and memory usage in both prefill and decoding stages. Recent works that compress KV caches with prefill acceleration reduce this cost but inadvertently tie the prefill compute reduction to the decoding KV budget. This coupling arises from overlooking the layer-dependent variation of critical context, often leading to accuracy degradation. To address this issue, we introduce FastKV, a KV cache compression framework designed to reduce latency in both prefill and decoding by leveraging the stabilization of token importance in later layers. FastKV performs full-context computation until a Token-Selective Propagation (TSP) layer, which forwards only the most informative tokens to subsequent layers. From these propagated tokens, FastKV independently selects salient KV entries for caching, thereby decoupling KV budget from the prefill compute reduction based on the TSP decision. This independent control of the TSP rate and KV retention rate enables flexible optimization of efficiency and accuracy. Experimental results show that FastKV achieves speedups of up to 1.82$\times$ in prefill and 2.87$\times$ in decoding compared to the full-context baseline, while matching the accuracy of the decoding-only baselines. Our code is available at https://github.com/dongwonjo/FastKV.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jo-etal-2026-fastkv">
<titleInfo>
<title>FastKV: Decoupling of Context Reduction and KV Cache Compression for Prefill-Decoding Acceleration</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dongwon</namePart>
<namePart type="family">Jo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiwon</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulhwa</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jae-Joon</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>While large language models (LLMs) excel at handling long-context sequences, they require substantial prefill computation and key-value (KV) cache, which can heavily burden computational efficiency and memory usage in both prefill and decoding stages. Recent works that compress KV caches with prefill acceleration reduce this cost but inadvertently tie the prefill compute reduction to the decoding KV budget. This coupling arises from overlooking the layer-dependent variation of critical context, often leading to accuracy degradation. To address this issue, we introduce FastKV, a KV cache compression framework designed to reduce latency in both prefill and decoding by leveraging the stabilization of token importance in later layers. FastKV performs full-context computation until a Token-Selective Propagation (TSP) layer, which forwards only the most informative tokens to subsequent layers. From these propagated tokens, FastKV independently selects salient KV entries for caching, thereby decoupling KV budget from the prefill compute reduction based on the TSP decision. This independent control of the TSP rate and KV retention rate enables flexible optimization of efficiency and accuracy. Experimental results show that FastKV achieves speedups of up to 1.82× in prefill and 2.87× in decoding compared to the full-context baseline, while matching the accuracy of the decoding-only baselines. Our code is available at https://github.com/dongwonjo/FastKV.</abstract>
<identifier type="citekey">jo-etal-2026-fastkv</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1610/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>32167</start>
<end>32186</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FastKV: Decoupling of Context Reduction and KV Cache Compression for Prefill-Decoding Acceleration
%A Jo, Dongwon
%A Song, Jiwon
%A Kim, Yulhwa
%A Kim, Jae-Joon
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F jo-etal-2026-fastkv
%X While large language models (LLMs) excel at handling long-context sequences, they require substantial prefill computation and key-value (KV) cache, which can heavily burden computational efficiency and memory usage in both prefill and decoding stages. Recent works that compress KV caches with prefill acceleration reduce this cost but inadvertently tie the prefill compute reduction to the decoding KV budget. This coupling arises from overlooking the layer-dependent variation of critical context, often leading to accuracy degradation. To address this issue, we introduce FastKV, a KV cache compression framework designed to reduce latency in both prefill and decoding by leveraging the stabilization of token importance in later layers. FastKV performs full-context computation until a Token-Selective Propagation (TSP) layer, which forwards only the most informative tokens to subsequent layers. From these propagated tokens, FastKV independently selects salient KV entries for caching, thereby decoupling KV budget from the prefill compute reduction based on the TSP decision. This independent control of the TSP rate and KV retention rate enables flexible optimization of efficiency and accuracy. Experimental results show that FastKV achieves speedups of up to 1.82× in prefill and 2.87× in decoding compared to the full-context baseline, while matching the accuracy of the decoding-only baselines. Our code is available at https://github.com/dongwonjo/FastKV.
%U https://aclanthology.org/2026.findings-acl.1610/
%P 32167-32186
Markdown (Informal)
[FastKV: Decoupling of Context Reduction and KV Cache Compression for Prefill-Decoding Acceleration](https://aclanthology.org/2026.findings-acl.1610/) (Jo et al., Findings 2026)
ACL