% Conference-paper record for "EQUIP" (Proceedings of the 64th Annual Meeting
% of the Association for Computational Linguistics, Volume 1: Long Papers).
%
% Notes for maintainers:
%   - Brace groups in title/booktitle protect whole words (acronyms, the
%     capitals that spell out the EQUIP acronym, proper nouns) from the
%     sentence-casing that many bibliography styles apply.
%   - month uses the predefined, unquoted three-letter macro.
%   - address is kept exactly as exported; it looks like the conference venue
%     rather than a publisher city (TODO confirm before relying on it as a
%     publisher location).
%   - The abstract keeps LaTeX markup (``...'' quotes, {\texttimes}) so it
%     stays 7-bit clean for classic BibTeX.
@inproceedings{ramachandran-etal-2026-equip,
  title     = {{EQUIP}: {EQUivariant} preserving {In-Place} updates for Efficient Token Pruning},
  author    = {Ramachandran, Arun and
               Govindarajan, R. and
               Annavaram, Murali and
               Raghavendra, Prakash},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.1210/},
  pages     = {26303--26323},
  isbn      = {979-8-89176-390-6},
  abstract  = {Token-pruning has emerged as a primary focus in large language models (LLMs) to enhance model efficiency while preserving accuracy, especially for large sequence lengths. However, the eviction operation of token-pruning methods causes ``holes'' in KV tensors, posing two major challenges: (1) The shift operation, required to make the KV tensor contiguous, results in significant copy overheads; (2) The changes in position indices due to token eviction lead to increased computational requirements for Rotary Positional Encoding (RoPE). To address these issues, we introduce EQUIP, an EQUivariant preserving in-place token update mechanism that ensures the equivariance property of the operations performed in the attention computation. EQUIP offers two fundamental advantages: First, it combines eviction and a subsequent token insertion into an in-place replacement operation, which reduces the KV cache copy overheads significantly. Second, EQUIP reduces recomputation of rotation operations through a combination of in-place update, caching and a re-indexing strategy. Together, these optimizations enable EQUIP to achieve geomean speedups of 1.62{\texttimes} (or 1.47{\texttimes}) on CPU (GPU) over StreamingLLM, and 3.45{\texttimes} (or 1.86{\texttimes}) on CPU (GPU) over Heavy Hitters (H2O). EQUIP with Paged Attention achieves speedups of 4.18{\texttimes}(2.61{\texttimes}) on CPU (GPU) over auto-regressive baselines. EQUIP matches the model accuracy of baseline pruning methods while delivering superior performance.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, keyed by the ID attribute on the mods element. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ramachandran-etal-2026-equip">
    <!-- Title of the paper itself -->
    <titleInfo>
      <title>EQUIP: EQUivariant preserving In-Place updates for Efficient Token Pruning</title>
    </titleInfo>

    <!-- Paper authors, in the order listed -->
    <name type="personal">
      <namePart type="given">Arun</namePart>
      <namePart type="family">Ramachandran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">R</namePart>
      <namePart type="family">Govindarajan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Murali</namePart>
      <namePart type="family">Annavaram</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Prakash</namePart>
      <namePart type="family">Raghavendra</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year and month) -->
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>

    <!-- Abstract text, stored with literal Unicode punctuation and symbols -->
    <abstract>Token-pruning has emerged as a primary focus in large language models (LLMs) to enhance model efficiency while preserving accuracy, especially for large sequence lengths. However, the eviction operation of token-pruning methods causes “holes” in KV tensors, posing two major challenges: (1) The shift operation, required to make the KV tensor contiguous, results in significant copy overheads; (2) The changes in position indices due to token eviction lead to increased computational requirements for Rotary Positional Encoding (RoPE). To address these issues, we introduce EQUIP, an EQUivariant preserving in-place token update mechanism that ensures the equivariance property of the operations performed in the attention computation. EQUIP offers two fundamental advantages: First, it combines eviction and a subsequent token insertion into an in-place replacement operation, which reduces the KV cache copy overheads significantly. Second, EQUIP reduces recomputation of rotation operations through a combination of in-place update, caching and a re-indexing strategy. Together, these optimizations enable EQUIP to achieve geomean speedups of 1.62× (or 1.47×) on CPU (GPU) over StreamingLLM, and 3.45× (or 1.86×) on CPU (GPU) over Heavy Hitters (H2O). EQUIP with Paged Attention achieves speedups of 4.18×(2.61×) on CPU (GPU) over auto-regressive baselines. EQUIP matches the model accuracy of baseline pruning methods while delivering superior performance.</abstract>

    <!-- Citation key and landing-page URL -->
    <identifier type="citekey">ramachandran-etal-2026-equip</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.1210/</url>
    </location>

    <!-- Position inside the host volume: date and page range -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>26303</start>
        <end>26323</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T EQUIP: EQUivariant preserving In-Place updates for Efficient Token Pruning
%A Ramachandran, Arun
%A Govindarajan, R.
%A Annavaram, Murali
%A Raghavendra, Prakash
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F ramachandran-etal-2026-equip
%X Token-pruning has emerged as a primary focus in large language models (LLMs) to enhance model efficiency while preserving accuracy, especially for large sequence lengths. However, the eviction operation of token-pruning methods causes “holes” in KV tensors, posing two major challenges: (1) The shift operation, required to make the KV tensor contiguous, results in significant copy overheads; (2) The changes in position indices due to token eviction lead to increased computational requirements for Rotary Positional Encoding (RoPE). To address these issues, we introduce EQUIP, an EQUivariant preserving in-place token update mechanism that ensures the equivariance property of the operations performed in the attention computation. EQUIP offers two fundamental advantages: First, it combines eviction and a subsequent token insertion into an in-place replacement operation, which reduces the KV cache copy overheads significantly. Second, EQUIP reduces recomputation of rotation operations through a combination of in-place update, caching and a re-indexing strategy. Together, these optimizations enable EQUIP to achieve geomean speedups of 1.62× (or 1.47×) on CPU (GPU) over StreamingLLM, and 3.45× (or 1.86×) on CPU (GPU) over Heavy Hitters (H2O). EQUIP with Paged Attention achieves speedups of 4.18×(2.61×) on CPU (GPU) over auto-regressive baselines. EQUIP matches the model accuracy of baseline pruning methods while delivering superior performance.
%U https://aclanthology.org/2026.acl-long.1210/
%P 26303-26323
Markdown (Informal)
[EQUIP: EQUivariant preserving In-Place updates for Efficient Token Pruning](https://aclanthology.org/2026.acl-long.1210/) (Ramachandran et al., ACL 2026)
ACL
- Arun Ramachandran, R Govindarajan, Murali Annavaram, and Prakash Raghavendra. 2026. EQUIP: EQUivariant preserving In-Place updates for Efficient Token Pruning. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 26303–26323, San Diego, California, United States. Association for Computational Linguistics.