% BibTeX record for the ACL Anthology paper 2026.findings-acl.801.
% Lines outside an entry are ignored by BibTeX, so this header acts as a comment.
@inproceedings{he-etal-2026-accelerating,
  title     = {Accelerating Prefilling via Decoding-time Contribution Sparsity},
  author    = {He, Zhiyuan and
               Zhang, Yike and
               Zhang, Chengruidong and
               Jiang, Huiqiang and
               Yang, Yuqing and
               Qiu, Lili},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.801/},
  pages     = {16296--16308},
  isbn      = {979-8-89176-395-1},
  abstract  = {Large Language Models (LLMs) incur quadratic attention complexity with input length, creating a major time bottleneck in the prefilling stage. Existing acceleration methods largely exploit attention score sparsity by estimating blocks with high attention scores and applying dynamic sparse attention. In this work, we identify another untapped form of sparsity in the prefilling stage, namely decoding-time contribution sparsity, where many attention blocks exhibit nontrivial attention scores during prefilling yet contribute negligibly to subsequent decoding. Building on this observation, we propose TriangleMix, which replaces dense attention with Triangle attention in a subset of layers. Extensive experiments demonstrate that TriangleMix achieves near-lossless performance on both long-context and long-context reasoning benchmarks, while significantly improving efficiency. For 128K inputs, Triangle attention in the subset of layers achieves a $ 15.3 \times $ speedup in attention kernel computation, significantly exceeding the acceleration of typical dynamic sparse methods ($ 1.9 \times $ to $ 3.4 \times $). Furthermore, TriangleMix can be seamlessly combined with dynamic sparsity approaches, delivering an additional 6{\%}{--}19{\%} reduction in TTFT over using dynamic sparsity alone.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="he-etal-2026-accelerating">
<titleInfo>
<title>Accelerating Prefilling via Decoding-time Contribution Sparsity</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yike</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengruidong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huiqiang</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuqing</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lili</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) incur quadratic attention complexity with input length, creating a major time bottleneck in the prefilling stage. Existing acceleration methods largely exploit attention score sparsity by estimating blocks with high attention scores and applying dynamic sparse attention. In this work, we identify another untapped form of sparsity in the prefilling stage, namely decoding-time contribution sparsity, where many attention blocks exhibit nontrivial attention scores during prefilling yet contribute negligibly to subsequent decoding. Building on this observation, we propose TriangleMix, which replaces dense attention with Triangle attention in a subset of layers. Extensive experiments demonstrate that TriangleMix achieves near-lossless performance on both long-context and long-context reasoning benchmarks, while significantly improving efficiency. For 128K inputs, Triangle attention in the subset of layers achieves a 15.3× speedup in attention kernel computation, significantly exceeding the acceleration of typical dynamic sparse methods (1.9× to 3.4×). Furthermore, TriangleMix can be seamlessly combined with dynamic sparsity approaches, delivering an additional 6%–19% reduction in TTFT over using dynamic sparsity alone.</abstract>
<identifier type="citekey">he-etal-2026-accelerating</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.801/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>16296</start>
<end>16308</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Accelerating Prefilling via Decoding-time Contribution Sparsity
%A He, Zhiyuan
%A Zhang, Yike
%A Zhang, Chengruidong
%A Jiang, Huiqiang
%A Yang, Yuqing
%A Qiu, Lili
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F he-etal-2026-accelerating
%X Large Language Models (LLMs) incur quadratic attention complexity with input length, creating a major time bottleneck in the prefilling stage. Existing acceleration methods largely exploit attention score sparsity by estimating blocks with high attention scores and applying dynamic sparse attention. In this work, we identify another untapped form of sparsity in the prefilling stage, namely decoding-time contribution sparsity, where many attention blocks exhibit nontrivial attention scores during prefilling yet contribute negligibly to subsequent decoding. Building on this observation, we propose TriangleMix, which replaces dense attention with Triangle attention in a subset of layers. Extensive experiments demonstrate that TriangleMix achieves near-lossless performance on both long-context and long-context reasoning benchmarks, while significantly improving efficiency. For 128K inputs, Triangle attention in the subset of layers achieves a 15.3× speedup in attention kernel computation, significantly exceeding the acceleration of typical dynamic sparse methods (1.9× to 3.4×). Furthermore, TriangleMix can be seamlessly combined with dynamic sparsity approaches, delivering an additional 6%–19% reduction in TTFT over using dynamic sparsity alone.
%U https://aclanthology.org/2026.findings-acl.801/
%P 16296-16308
Markdown (Informal)
[Accelerating Prefilling via Decoding-time Contribution Sparsity](https://aclanthology.org/2026.findings-acl.801/) (He et al., Findings 2026)
ACL
- Zhiyuan He, Yike Zhang, Chengruidong Zhang, Huiqiang Jiang, Yuqing Yang, and Lili Qiu. 2026. Accelerating Prefilling via Decoding-time Contribution Sparsity. In Findings of the Association for Computational Linguistics: ACL 2026, pages 16296–16308, San Diego, California, United States. Association for Computational Linguistics.