@inproceedings{zhou-etal-2025-dynamickv,
title = "{D}ynamic{KV}: Task-Aware Adaptive {KV} Cache Compression for Long Context {LLM}s",
author = "Zhou, Xiabin and
Wang, Wenbin and
Zeng, Minyan and
Guo, Jiaxian and
Liu, Xuebo and
Shen, Li and
Zhang, Min and
Ding, Liang",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.426/",
doi = "10.18653/v1/2025.findings-emnlp.426",
pages = "8042--8057",
ISBN = "979-8-89176-335-7",
abstract = "Efficiently managing the KV cache in Large Language Models (LLMs) is a critical challenge for long-context processing tasks such as retrieval-augmented generation (RAG), long text summarization, and multi-document analysis. Extending the context length substantially increases the KV cache size, leading to excessive memory consumption. Existing KV cache compression methods enforce a fixed pattern, neglecting task-specific characteristics, which hampers the effective retention of essential information while discarding less important tokens. In this paper, we introduce a novel Task-Aware KV cache mechanism that dynamically adjusts the KV cache size across different layers based on the characteristics of the tasks. Our approach builds on the significant observation of distinct activation patterns across layers in various tasks, which highlights the need for adaptive strategies tailored to each task{'}s unique demands. Based on this insight, we propose DynamicKV, a method that dynamically optimizes token retention by adjusting the number of tokens retained at each layer, adapting to the specific task. DynamicKV establishes global and per-layer maximum KV cache budgets, temporarily retaining the maximum budget for the current layer, and periodically updating the KV cache sizes of all preceding layers during inference. Our method demonstrates exceptional performance on the LongBench dataset, retaining only 1.7{\%} of the KV cache while preserving 90{\%}, 87{\%}, 78{\%}, and 83{\%} of the original accuracy for LlaMA-3-8B-Instruct, Mistral-7B-Instruct-v0.2, Qwen2-7B-Instruct, and InternLM-2.5-7B-Chat-1M, respectively. When the retained KV cache size is increased to 6.9{\%}, the performance becomes nearly indistinguishable from that without any KV cache compression. Notably, even under extreme compression (0.9{\%}), DynamicKV surpasses state-of-the-art (SOTA) methods by 11{\%} in the Needle-in-a-Haystack test using Mistral-7B-Instruct-v0.2. The code is available at repository https://github.com/DreamMr/DynamicK."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-etal-2025-dynamickv">
<titleInfo>
<title>DynamicKV: Task-Aware Adaptive KV Cache Compression for Long Context LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiabin</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenbin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minyan</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaxian</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuebo</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Efficiently managing the KV cache in Large Language Models (LLMs) is a critical challenge for long-context processing tasks such as retrieval-augmented generation (RAG), long text summarization, and multi-document analysis. Extending the context length substantially increases the KV cache size, leading to excessive memory consumption. Existing KV cache compression methods enforce a fixed pattern, neglecting task-specific characteristics, which hampers the effective retention of essential information while discarding less important tokens. In this paper, we introduce a novel Task-Aware KV cache mechanism that dynamically adjusts the KV cache size across different layers based on the characteristics of the tasks. Our approach builds on the significant observation of distinct activation patterns across layers in various tasks, which highlights the need for adaptive strategies tailored to each task’s unique demands. Based on this insight, we propose DynamicKV, a method that dynamically optimizes token retention by adjusting the number of tokens retained at each layer, adapting to the specific task. DynamicKV establishes global and per-layer maximum KV cache budgets, temporarily retaining the maximum budget for the current layer, and periodically updating the KV cache sizes of all preceding layers during inference. Our method demonstrates exceptional performance on the LongBench dataset, retaining only 1.7% of the KV cache while preserving 90%, 87%, 78%, and 83% of the original accuracy for LlaMA-3-8B-Instruct, Mistral-7B-Instruct-v0.2, Qwen2-7B-Instruct, and InternLM-2.5-7B-Chat-1M, respectively. When the retained KV cache size is increased to 6.9%, the performance becomes nearly indistinguishable from that without any KV cache compression. Notably, even under extreme compression (0.9%), DynamicKV surpasses state-of-the-art (SOTA) methods by 11% in the Needle-in-a-Haystack test using Mistral-7B-Instruct-v0.2. The code is available at repository https://github.com/DreamMr/DynamicK.</abstract>
<identifier type="citekey">zhou-etal-2025-dynamickv</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.426</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.426/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>8042</start>
<end>8057</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DynamicKV: Task-Aware Adaptive KV Cache Compression for Long Context LLMs
%A Zhou, Xiabin
%A Wang, Wenbin
%A Zeng, Minyan
%A Guo, Jiaxian
%A Liu, Xuebo
%A Shen, Li
%A Zhang, Min
%A Ding, Liang
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F zhou-etal-2025-dynamickv
%X Efficiently managing the KV cache in Large Language Models (LLMs) is a critical challenge for long-context processing tasks such as retrieval-augmented generation (RAG), long text summarization, and multi-document analysis. Extending the context length substantially increases the KV cache size, leading to excessive memory consumption. Existing KV cache compression methods enforce a fixed pattern, neglecting task-specific characteristics, which hampers the effective retention of essential information while discarding less important tokens. In this paper, we introduce a novel Task-Aware KV cache mechanism that dynamically adjusts the KV cache size across different layers based on the characteristics of the tasks. Our approach builds on the significant observation of distinct activation patterns across layers in various tasks, which highlights the need for adaptive strategies tailored to each task’s unique demands. Based on this insight, we propose DynamicKV, a method that dynamically optimizes token retention by adjusting the number of tokens retained at each layer, adapting to the specific task. DynamicKV establishes global and per-layer maximum KV cache budgets, temporarily retaining the maximum budget for the current layer, and periodically updating the KV cache sizes of all preceding layers during inference. Our method demonstrates exceptional performance on the LongBench dataset, retaining only 1.7% of the KV cache while preserving 90%, 87%, 78%, and 83% of the original accuracy for LlaMA-3-8B-Instruct, Mistral-7B-Instruct-v0.2, Qwen2-7B-Instruct, and InternLM-2.5-7B-Chat-1M, respectively. When the retained KV cache size is increased to 6.9%, the performance becomes nearly indistinguishable from that without any KV cache compression. Notably, even under extreme compression (0.9%), DynamicKV surpasses state-of-the-art (SOTA) methods by 11% in the Needle-in-a-Haystack test using Mistral-7B-Instruct-v0.2. The code is available at repository https://github.com/DreamMr/DynamicK.
%R 10.18653/v1/2025.findings-emnlp.426
%U https://aclanthology.org/2025.findings-emnlp.426/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.426
%P 8042-8057
Markdown (Informal)
[DynamicKV: Task-Aware Adaptive KV Cache Compression for Long Context LLMs](https://aclanthology.org/2025.findings-emnlp.426/) (Zhou et al., Findings 2025)
ACL
- Xiabin Zhou, Wenbin Wang, Minyan Zeng, Jiaxian Guo, Xuebo Liu, Li Shen, Min Zhang, and Liang Ding. 2025. DynamicKV: Task-Aware Adaptive KV Cache Compression for Long Context LLMs. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 8042–8057, Suzhou, China. Association for Computational Linguistics.