@article{lee-etal-2025-tale,
title = "{TALE}: Token-Adaptive Low-Rank {KVC}ache Approximation with Reconstruction Elimination",
author = "Lee, Jaeseong and
Hwang, Seung-won and
Qiao, Aurick and
Campos, Daniel and
Yao, Zhewei and
He, Yuxiong",
journal = "Transactions of the Association for Computational Linguistics",
volume = "13",
year = "2025",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2025.tacl-1.59/",
doi = "10.1162/tacl.a.39",
pages = "1298--1318",
abstract = "KVCache, by storing key-value pairs for reuse, has been crucial for enhancing inference efficiency for large language models (LLMs). However, the increasing memory demands of KVCache, especially with recent trends of longer input sequences, present a major challenge. In this work, we propose an innovative token-adaptive low-rank approximation strategy for KVCache compression. By applying varying ranks based on token significance, our method compresses KVCache efficiently while retaining critical information. Moreover, we introduce a lazy approximation technique, which approximates lazily only when needed, alongside a reconstruction-free design to bypass costly recalculations. Combined with multi-level quantization, this method reduces KVCache size by 9.1{\texttimes} on the Llama-3.1-8B model, with minimal performance degradation on complex tasks such as GSM8K. Moreover, our custom attention implementation shows up to 2{\texttimes} latency reduction compared to the conventional method in long context scenarios. The code is publicly available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lee-etal-2025-tale">
<titleInfo>
<title>TALE: Token-Adaptive Low-Rank KVCache Approximation with Reconstruction Elimination</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jaeseong</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-won</namePart>
<namePart type="family">Hwang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aurick</namePart>
<namePart type="family">Qiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Campos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhewei</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuxiong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>KVCache, by storing key-value pairs for reuse, has been crucial for enhancing inference efficiency for large language models (LLMs). However, the increasing memory demands of KVCache, especially with recent trends of longer input sequences, present a major challenge. In this work, we propose an innovative token-adaptive low-rank approximation strategy for KVCache compression. By applying varying ranks based on token significance, our method compresses KVCache efficiently while retaining critical information. Moreover, we introduce a lazy approximation technique, which approximates lazily only when needed, alongside a reconstruction-free design to bypass costly recalculations. Combined with multi-level quantization, this method reduces KVCache size by 9.1× on the Llama-3.1-8B model, with minimal performance degradation on complex tasks such as GSM8K. Moreover, our custom attention implementation shows up to 2× latency reduction compared to the conventional method in long context scenarios. The code is publicly available.</abstract>
<identifier type="citekey">lee-etal-2025-tale</identifier>
<identifier type="doi">10.1162/tacl.a.39</identifier>
<location>
<url>https://aclanthology.org/2025.tacl-1.59/</url>
</location>
<part>
<date>2025</date>
<detail type="volume"><number>13</number></detail>
<extent unit="page">
<start>1298</start>
<end>1318</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T TALE: Token-Adaptive Low-Rank KVCache Approximation with Reconstruction Elimination
%A Lee, Jaeseong
%A Hwang, Seung-won
%A Qiao, Aurick
%A Campos, Daniel
%A Yao, Zhewei
%A He, Yuxiong
%J Transactions of the Association for Computational Linguistics
%D 2025
%V 13
%I MIT Press
%C Cambridge, MA
%F lee-etal-2025-tale
%X KVCache, by storing key-value pairs for reuse, has been crucial for enhancing inference efficiency for large language models (LLMs). However, the increasing memory demands of KVCache, especially with recent trends of longer input sequences, present a major challenge. In this work, we propose an innovative token-adaptive low-rank approximation strategy for KVCache compression. By applying varying ranks based on token significance, our method compresses KVCache efficiently while retaining critical information. Moreover, we introduce a lazy approximation technique, which approximates lazily only when needed, alongside a reconstruction-free design to bypass costly recalculations. Combined with multi-level quantization, this method reduces KVCache size by 9.1× on the Llama-3.1-8B model, with minimal performance degradation on complex tasks such as GSM8K. Moreover, our custom attention implementation shows up to 2× latency reduction compared to the conventional method in long context scenarios. The code is publicly available.
%R 10.1162/tacl.a.39
%U https://aclanthology.org/2025.tacl-1.59/
%U https://doi.org/10.1162/tacl.a.39
%P 1298-1318
Markdown (Informal)
[TALE: Token-Adaptive Low-Rank KVCache Approximation with Reconstruction Elimination](https://aclanthology.org/2025.tacl-1.59/) (Lee et al., TACL 2025)
ACL