% ACL Anthology record for the OjaKV paper (Findings of ACL 2026, pages 10161--10178).
% Case-sensitive words in title and booktitle are protected with whole-word braces
% so that sentence-casing bibliography styles leave them intact.
% NOTE(review): address is kept exactly as exported; confirm whether it denotes the
% conference venue or the publisher city before relocating it to another field.
@inproceedings{zhu-etal-2026-ojakv,
  title     = {{OjaKV}: Context-Aware Online Low-Rank {KV} Cache Compression},
  author    = {Zhu, Yuxuan and
               Yang, David H. and
               Amiri, Mohammad Mohammadi and
               Murugesan, Keerthiram and
               Pedapati, Tejaswini and
               Chen, Pin-Yu},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.494/},
  pages     = {10161--10178},
  isbn      = {979-8-89176-395-1},
  abstract  = {The expanding long-context capabilities of large language models are constrained by a significant memory bottleneck: the key-value (KV) cache required for autoregressive generation. This bottleneck is substantial; for instance, a Llama-3.1-8B model processing a 32K-token prompt at a batch size of 4 requires approximately 16 GB for its KV cache, exceeding the model{'}s weights. While KV-cache compression via low-rank projection is promising, existing methods rely on a static, offline-learned subspace that performs poorly under distribution shifts. To overcome these limitations, we introduce OjaKV, a novel framework integrating a hybrid storage policy with online subspace adaptation. OjaKV preserves crucial tokens in full rank as high-fidelity anchors, while applying low-rank compression to intermediate tokens by adapting the projection basis using Oja{'}s algorithm for online PCA. This adaptation involves a comprehensive update during prefilling and lightweight periodic updates during decoding, ensuring the subspace remains aligned with evolving context. Our framework is fully compatible with FlashAttention. Experiments demonstrate that OjaKV maintains or improves zero-shot accuracy at high compression ratios, achieving the strongest gains on long-context benchmarks requiring complex reasoning. Furthermore, our approach combines with token-selection methods for compounded memory savings, establishing a practical, plug-and-play solution for memory-efficient long-context inference without fine-tuning.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record (ID zhu-etal-2026-ojakv): six authors, issue date 2026-07, a host proceedings item with four editors, and a page extent of 10161 to 10178. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zhu-etal-2026-ojakv">
    <!-- Title of the paper itself -->
    <titleInfo>
      <title>OjaKV: Context-Aware Online Low-Rank KV Cache Compression</title>
    </titleInfo>
    <!-- Authors, in the order listed -->
    <name type="personal">
      <namePart type="given">Yuxuan</namePart>
      <namePart type="family">Zhu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="given">H</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohammad</namePart>
      <namePart type="given">Mohammadi</namePart>
      <namePart type="family">Amiri</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Keerthiram</namePart>
      <namePart type="family">Murugesan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tejaswini</namePart>
      <namePart type="family">Pedapati</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pin-Yu</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>The expanding long-context capabilities of large language models are constrained by a significant memory bottleneck: the key-value (KV) cache required for autoregressive generation. This bottleneck is substantial; for instance, a Llama-3.1-8B model processing a 32K-token prompt at a batch size of 4 requires approximately 16 GB for its KV cache, exceeding the model’s weights. While KV-cache compression via low-rank projection is promising, existing methods rely on a static, offline-learned subspace that performs poorly under distribution shifts. To overcome these limitations, we introduce OjaKV, a novel framework integrating a hybrid storage policy with online subspace adaptation. OjaKV preserves crucial tokens in full rank as high-fidelity anchors, while applying low-rank compression to intermediate tokens by adapting the projection basis using Oja’s algorithm for online PCA. This adaptation involves a comprehensive update during prefilling and lightweight periodic updates during decoding, ensuring the subspace remains aligned with evolving context. Our framework is fully compatible with FlashAttention. Experiments demonstrate that OjaKV maintains or improves zero-shot accuracy at high compression ratios, achieving the strongest gains on long-context benchmarks requiring complex reasoning. Furthermore, our approach combines with token-selection methods for compounded memory savings, establishing a practical, plug-and-play solution for memory-efficient long-context inference without fine-tuning.</abstract>
    <identifier type="citekey">zhu-etal-2026-ojakv</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.494/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>10161</start>
        <end>10178</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T OjaKV: Context-Aware Online Low-Rank KV Cache Compression
%A Zhu, Yuxuan
%A Yang, David H.
%A Amiri, Mohammad Mohammadi
%A Murugesan, Keerthiram
%A Pedapati, Tejaswini
%A Chen, Pin-Yu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhu-etal-2026-ojakv
%X The expanding long-context capabilities of large language models are constrained by a significant memory bottleneck: the key-value (KV) cache required for autoregressive generation. This bottleneck is substantial; for instance, a Llama-3.1-8B model processing a 32K-token prompt at a batch size of 4 requires approximately 16 GB for its KV cache, exceeding the model’s weights. While KV-cache compression via low-rank projection is promising, existing methods rely on a static, offline-learned subspace that performs poorly under distribution shifts. To overcome these limitations, we introduce OjaKV, a novel framework integrating a hybrid storage policy with online subspace adaptation. OjaKV preserves crucial tokens in full rank as high-fidelity anchors, while applying low-rank compression to intermediate tokens by adapting the projection basis using Oja’s algorithm for online PCA. This adaptation involves a comprehensive update during prefilling and lightweight periodic updates during decoding, ensuring the subspace remains aligned with evolving context. Our framework is fully compatible with FlashAttention. Experiments demonstrate that OjaKV maintains or improves zero-shot accuracy at high compression ratios, achieving the strongest gains on long-context benchmarks requiring complex reasoning. Furthermore, our approach combines with token-selection methods for compounded memory savings, establishing a practical, plug-and-play solution for memory-efficient long-context inference without fine-tuning.
%U https://aclanthology.org/2026.findings-acl.494/
%P 10161-10178
Markdown (Informal)
[OjaKV: Context-Aware Online Low-Rank KV Cache Compression](https://aclanthology.org/2026.findings-acl.494/) (Zhu et al., Findings 2026)
ACL
- Yuxuan Zhu, David H. Yang, Mohammad Mohammadi Amiri, Keerthiram Murugesan, Tejaswini Pedapati, and Pin-Yu Chen. 2026. OjaKV: Context-Aware Online Low-Rank KV Cache Compression. In Findings of the Association for Computational Linguistics: ACL 2026, pages 10161–10178, San Diego, California, United States. Association for Computational Linguistics.