@inproceedings{zhang-etal-2025-leank,
title = "{L}ean{K}: Learnable K Cache Channel Pruning for Efficient Decoding",
author = "Zhang, Yike and
He, Zhiyuan and
Jiang, Huiqiang and
Zhang, Chengruidong and
Yang, Yuqing and
Wang, Jianyong and
Qiu, Lili",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1584/",
doi = "10.18653/v1/2025.emnlp-main.1584",
pages = "31122--31137",
ISBN = "979-8-89176-332-6",
abstract = "Large language models (LLMs) enable long-context tasks but face efficiency challenges due to the growing key-value (KV) cache. We propose LeanK, a learning-based method that prunes unimportant key (K) cache channels by leveraging static channel sparsity. LeanK reduces GPU memory and accelerates decoding without sacrificing accuracy. Experiments demonstrate up to 70{\%} K cache and 16{\%}{--}18{\%} V cache memory reduction, and 1.45{\texttimes} decoding speedup. We also provide insights into model channels and attention heads during long-context inference by analyzing the learned importance distribution. Our code is anonymously available at https://anonymous.4open.science/r/LeanK-7A87/README.md."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-leank">
<titleInfo>
<title>LeanK: Learnable K Cache Channel Pruning for Efficient Decoding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yike</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huiqiang</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengruidong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuqing</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianyong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lili</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Large language models (LLMs) enable long-context tasks but face efficiency challenges due to the growing key-value (KV) cache. We propose LeanK, a learning-based method that prunes unimportant key (K) cache channels by leveraging static channel sparsity. LeanK reduces GPU memory and accelerates decoding without sacrificing accuracy. Experiments demonstrate up to 70% K cache and 16%–18% V cache memory reduction, and 1.45× decoding speedup. We also provide insights into model channels and attention heads during long-context inference by analyzing the learned importance distribution. Our code is anonymously available at https://anonymous.4open.science/r/LeanK-7A87/README.md.</abstract>
<identifier type="citekey">zhang-etal-2025-leank</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.1584</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1584/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>31122</start>
<end>31137</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LeanK: Learnable K Cache Channel Pruning for Efficient Decoding
%A Zhang, Yike
%A He, Zhiyuan
%A Jiang, Huiqiang
%A Zhang, Chengruidong
%A Yang, Yuqing
%A Wang, Jianyong
%A Qiu, Lili
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F zhang-etal-2025-leank
%X Large language models (LLMs) enable long-context tasks but face efficiency challenges due to the growing key-value (KV) cache. We propose LeanK, a learning-based method that prunes unimportant key (K) cache channels by leveraging static channel sparsity. LeanK reduces GPU memory and accelerates decoding without sacrificing accuracy. Experiments demonstrate up to 70% K cache and 16%–18% V cache memory reduction, and 1.45× decoding speedup. We also provide insights into model channels and attention heads during long-context inference by analyzing the learned importance distribution. Our code is anonymously available at https://anonymous.4open.science/r/LeanK-7A87/README.md.
%R 10.18653/v1/2025.emnlp-main.1584
%U https://aclanthology.org/2025.emnlp-main.1584/
%U https://doi.org/10.18653/v1/2025.emnlp-main.1584
%P 31122-31137
Markdown (Informal)
[LeanK: Learnable K Cache Channel Pruning for Efficient Decoding](https://aclanthology.org/2025.emnlp-main.1584/) (Zhang et al., EMNLP 2025)
ACL
- Yike Zhang, Zhiyuan He, Huiqiang Jiang, Chengruidong Zhang, Yuqing Yang, Jianyong Wang, and Lili Qiu. 2025. LeanK: Learnable K Cache Channel Pruning for Efficient Decoding. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 31122–31137, Suzhou, China. Association for Computational Linguistics.