@inproceedings{zhang-etal-2025-clusterattn,
title = "{C}luster{A}ttn: {KV} Cache Compression under Intrinsic Attention Clustering",
author = "Zhang, Minwei and
Sun, Haifeng and
Wang, Jingyu and
Li, Shaolong and
Ning, Wanyi and
Qi, Qi and
Zhuang, Zirui and
Liao, Jianxin",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.703/",
doi = "10.18653/v1/2025.acl-long.703",
pages = "14451--14473",
ISBN = "979-8-89176-251-0",
abstract = "Sparse attention can effectively alleviate the significant demands on memory when large language models (LLMs) process long contexts. Existing methods typically apply the same sparse pattern across different attention heads and inputs. However, this uniform approach fails to capture the inherent diversity of attention patterns within LLMs {---} the intrinsic attention clustering. To address this, we propose ClusterAttn, a training-free sparse attention method that provides an efficient prompt cache compression scheme under intrinsic attention clustering for efficient LLM inference.Our findings show that attention heads consistently focus on specific clusters of the prompt during decoding, a pattern detectable from an observation window at the prompt{'}s end. ClusterAttn adaptively fits these clusters utilizing a density-based attention clustering algorithm, thus compressing the KV cache of the prompt. Evaluations on different models across various benchmarks demonstrate ClusterAttn{'}s superior compression rates and efficiency. By utilizing only 1024 tokens, it can reduce memory usage by 10{\%}{--}65{\%}, resulting in a latency reduction of 12{\%}{--}23{\%} and a throughput increase of 2.6{--}4.8 times, all with nearly no accuracy loss. Additionally, ClusterAttn can handle up to 128k context on a single A100-80GB GPU, outperforming existing methods."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-clusterattn">
<titleInfo>
<title>ClusterAttn: KV Cache Compression under Intrinsic Attention Clustering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Minwei</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haifeng</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingyu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shaolong</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wanyi</namePart>
<namePart type="family">Ning</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zirui</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianxin</namePart>
<namePart type="family">Liao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Sparse attention can effectively alleviate the significant demands on memory when large language models (LLMs) process long contexts. Existing methods typically apply the same sparse pattern across different attention heads and inputs. However, this uniform approach fails to capture the inherent diversity of attention patterns within LLMs — the intrinsic attention clustering. To address this, we propose ClusterAttn, a training-free sparse attention method that provides an efficient prompt cache compression scheme under intrinsic attention clustering for efficient LLM inference. Our findings show that attention heads consistently focus on specific clusters of the prompt during decoding, a pattern detectable from an observation window at the prompt’s end. ClusterAttn adaptively fits these clusters utilizing a density-based attention clustering algorithm, thus compressing the KV cache of the prompt. Evaluations on different models across various benchmarks demonstrate ClusterAttn’s superior compression rates and efficiency. By utilizing only 1024 tokens, it can reduce memory usage by 10%–65%, resulting in a latency reduction of 12%–23% and a throughput increase of 2.6–4.8 times, all with nearly no accuracy loss. Additionally, ClusterAttn can handle up to 128k context on a single A100-80GB GPU, outperforming existing methods.</abstract>
<identifier type="citekey">zhang-etal-2025-clusterattn</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.703</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.703/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>14451</start>
<end>14473</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ClusterAttn: KV Cache Compression under Intrinsic Attention Clustering
%A Zhang, Minwei
%A Sun, Haifeng
%A Wang, Jingyu
%A Li, Shaolong
%A Ning, Wanyi
%A Qi, Qi
%A Zhuang, Zirui
%A Liao, Jianxin
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F zhang-etal-2025-clusterattn
%X Sparse attention can effectively alleviate the significant demands on memory when large language models (LLMs) process long contexts. Existing methods typically apply the same sparse pattern across different attention heads and inputs. However, this uniform approach fails to capture the inherent diversity of attention patterns within LLMs — the intrinsic attention clustering. To address this, we propose ClusterAttn, a training-free sparse attention method that provides an efficient prompt cache compression scheme under intrinsic attention clustering for efficient LLM inference. Our findings show that attention heads consistently focus on specific clusters of the prompt during decoding, a pattern detectable from an observation window at the prompt’s end. ClusterAttn adaptively fits these clusters utilizing a density-based attention clustering algorithm, thus compressing the KV cache of the prompt. Evaluations on different models across various benchmarks demonstrate ClusterAttn’s superior compression rates and efficiency. By utilizing only 1024 tokens, it can reduce memory usage by 10%–65%, resulting in a latency reduction of 12%–23% and a throughput increase of 2.6–4.8 times, all with nearly no accuracy loss. Additionally, ClusterAttn can handle up to 128k context on a single A100-80GB GPU, outperforming existing methods.
%R 10.18653/v1/2025.acl-long.703
%U https://aclanthology.org/2025.acl-long.703/
%U https://doi.org/10.18653/v1/2025.acl-long.703
%P 14451-14473
Markdown (Informal)
[ClusterAttn: KV Cache Compression under Intrinsic Attention Clustering](https://aclanthology.org/2025.acl-long.703/) (Zhang et al., ACL 2025)
ACL
- Minwei Zhang, Haifeng Sun, Jingyu Wang, Shaolong Li, Wanyi Ning, Qi Qi, Zirui Zhuang, and Jianxin Liao. 2025. ClusterAttn: KV Cache Compression under Intrinsic Attention Clustering. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 14451–14473, Vienna, Austria. Association for Computational Linguistics.
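
The abstract describes the core idea at a high level: during decoding, each attention head concentrates on dense clusters of prompt positions, these clusters are detectable from an observation window at the end of the prompt, and only the KV-cache entries inside them need to be retained. The sketch below is a minimal illustration of that general idea, not the authors' implementation; the aggregation step, the quantile threshold, the gap-based cluster grouping, and the parameter names (`score_quantile`, `density_gap`, `keep_budget`) are assumptions made for the example.

```python
# Illustrative sketch (not the paper's code): per-head selection of prompt
# KV-cache positions by grouping high-attention positions into density
# clusters, using attention from an observation window at the prompt's end.
import numpy as np

def select_kv_positions(attn, keep_budget=1024, density_gap=8, score_quantile=0.9):
    """
    attn: (window_len, prompt_len) attention weights from the observation
          window (last few prompt tokens) to every prompt position, one head.
    Returns the prompt positions whose KV entries would be kept.
    """
    # Aggregate attention over the observation window.
    scores = attn.mean(axis=0)                      # (prompt_len,)

    # Candidate positions: those clearing a quantile threshold (assumed heuristic).
    thresh = np.quantile(scores, score_quantile)
    cand = np.flatnonzero(scores >= thresh)
    if cand.size == 0:
        return cand

    # Group candidates into density clusters: positions within `density_gap`
    # tokens of the previous candidate belong to the same cluster.
    clusters, cur = [], [cand[0]]
    for p in cand[1:]:
        if p - cur[-1] <= density_gap:
            cur.append(p)
        else:
            clusters.append(cur)
            cur = [p]
    clusters.append(cur)

    # Rank clusters by total attention mass and keep whole clusters
    # until the token budget is exhausted.
    clusters.sort(key=lambda c: scores[c].sum(), reverse=True)
    kept = []
    for c in clusters:
        if len(kept) + len(c) > keep_budget:
            break
        kept.extend(c)
    return np.sort(np.array(kept))

# Toy usage: random attention over a 4096-token prompt, 32-token window.
rng = np.random.default_rng(0)
attn = rng.random((32, 4096))
keep = select_kv_positions(attn, keep_budget=1024)
print(f"kept {keep.size} of 4096 prompt positions")
```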