@inproceedings{zhang-etal-2026-lazyeviction,
title = "{L}azy{E}viction: Lagged {KV} Eviction with Attention Pattern Observation for Efficient Long Reasoning",
author = "Zhang, Haoyue and
Zhang, Hualei and
Ma, Xiaosong and
Zhang, Jie and
Guo, Song",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.1683/",
pages = "36335--36352",
ISBN = "979-8-89176-390-6",
abstract = "Large Language Models (LLMs) exhibit enhanced capabilities by Chain-of-Thought reasoning. However, the extended reasoning sequences introduce significant GPU memory overhead due to increased key-value (KV) cache. Existing KV cache compression methods mitigate memory bottlenecks but struggle in long reasoning tasks. In this paper, we analyze attention patterns in reasoning tasks and reveal a **Token Importance Recurrence** phenomenon: a large proportion of tokens regain high attention after multiple decoding steps, which is failed to capture by existing works and may lead to unpredictable eviction on such periodically critical tokens. To address this, we propose **LazyEviction**, an observation window-based lagged eviction framework retaining latent recurring tokens by prioritized eviction based on tokens' recurrence patterns. Extensive experiments demonstrate that LazyEviction reduces KV cache by 50{\%}--70{\%} while maintaining comparable accuracy, outperforming existing KV cache baselines. Our implementation code can be found at https://github.com/Halo-949/LazyEviction."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2026-lazyeviction">
<titleInfo>
<title>LazyEviction: Lagged KV Eviction with Attention Pattern Observation for Efficient Long Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haoyue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hualei</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaosong</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jie</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Song</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) exhibit enhanced capabilities by Chain-of-Thought reasoning. However, the extended reasoning sequences introduce significant GPU memory overhead due to increased key-value (KV) cache. Existing KV cache compression methods mitigate memory bottlenecks but struggle in long reasoning tasks. In this paper, we analyze attention patterns in reasoning tasks and reveal a **Token Importance Recurrence** phenomenon: a large proportion of tokens regain high attention after multiple decoding steps, which is failed to capture by existing works and may lead to unpredictable eviction on such periodically critical tokens. To address this, we propose **LazyEviction**, an observation window-based lagged eviction framework retaining latent recurring tokens by prioritized eviction based on tokens’ recurrence patterns. Extensive experiments demonstrate that LazyEviction reduces KV cache by 50%–70% while maintaining comparable accuracy, outperforming existing KV cache baselines. Our implementation code can be found at https://github.com/Halo-949/LazyEviction.</abstract>
<identifier type="citekey">zhang-etal-2026-lazyeviction</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.1683/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>36335</start>
<end>36352</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LazyEviction: Lagged KV Eviction with Attention Pattern Observation for Efficient Long Reasoning
%A Zhang, Haoyue
%A Zhang, Hualei
%A Ma, Xiaosong
%A Zhang, Jie
%A Guo, Song
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F zhang-etal-2026-lazyeviction
%X Large Language Models (LLMs) exhibit enhanced capabilities by Chain-of-Thought reasoning. However, the extended reasoning sequences introduce significant GPU memory overhead due to increased key-value (KV) cache. Existing KV cache compression methods mitigate memory bottlenecks but struggle in long reasoning tasks. In this paper, we analyze attention patterns in reasoning tasks and reveal a **Token Importance Recurrence** phenomenon: a large proportion of tokens regain high attention after multiple decoding steps, which is failed to capture by existing works and may lead to unpredictable eviction on such periodically critical tokens. To address this, we propose **LazyEviction**, an observation window-based lagged eviction framework retaining latent recurring tokens by prioritized eviction based on tokens’ recurrence patterns. Extensive experiments demonstrate that LazyEviction reduces KV cache by 50%–70% while maintaining comparable accuracy, outperforming existing KV cache baselines. Our implementation code can be found at https://github.com/Halo-949/LazyEviction.
%U https://aclanthology.org/2026.acl-long.1683/
%P 36335-36352
Markdown (Informal)
[LazyEviction: Lagged KV Eviction with Attention Pattern Observation for Efficient Long Reasoning](https://aclanthology.org/2026.acl-long.1683/) (Zhang et al., ACL 2026)
ACL