@inproceedings{zhang-etal-2026-s2o,
title = "{S}2{O}: Early Stopping for Sparse Attention via Online Permutation",
author = "Zhang, Yu and
Liu, Songwei and
Yan, Chenqian and
Linsheng and
Ning, Beichen and
Chen, Fangmin and
Wang, Xing",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.351/",
pages = "7737--7751",
ISBN = "979-8-89176-390-6",
abstract = "Attention scales quadratically with sequence length, fundamentally limiting long-context inference.Existing block-granularity sparsification can reduce latency, but coarse blocks impose an intrinsic sparsity ceiling, making further improvements difficult even with carefully engineered designs.We present \textbf{S2O}, which performs early \textbf{s}topping for \textbf{s}parse attention via \textbf{o}nline permutation.Inspired by virtual-to-physical address mapping in memory systems, S2O revisits and factorizes FlashAttention execution, enabling inference to load non-contiguous tokens rather than a contiguous span in the original order.Motivated by fine-grained structures in attention heatmaps, we transform explicit permutation into an online, index-guided, discrete loading policy; with extremely lightweight preprocessing and index-remapping overhead, it concentrates importance on a small set of high-priority blocks.Building on this importance-guided online permutation for loading, S2O further introduces an early-stopping rule: computation proceeds from high to low importance; once the current block score falls below a threshold, S2O terminates early and skips the remaining low-contribution blocks, thereby increasing effective sparsity and reducing computation under a controlled error budget.As a result, S2O substantially raises the practical sparsity ceiling.On \textbf{Llama-3.1-8B} under a \textbf{128K} context, S2O reduces single-operator MSE by \textbf{3.82$\times$} at matched sparsity, and reduces prefill compute density by \textbf{3.31$\times$} at matched MSE; meanwhile, it preserves end-to-end accuracy and achieves \textbf{7.51$\times$} attention and \textbf{3.81$\times$} end-to-end speedups."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2026-s2o">
<titleInfo>
<title>S2O: Early Stopping for Sparse Attention via Online Permutation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Songwei</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenqian</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name>
<namePart>Linsheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Beichen</namePart>
<namePart type="family">Ning</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fangmin</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xing</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Attention scales quadratically with sequence length, fundamentally limiting long-context inference.Existing block-granularity sparsification can reduce latency, but coarse blocks impose an intrinsic sparsity ceiling, making further improvements difficult even with carefully engineered designs.We present S2O, which performs early stopping for sparse attention via online permutation.Inspired by virtual-to-physical address mapping in memory systems, S2O revisits and factorizes FlashAttention execution, enabling inference to load non-contiguous tokens rather than a contiguous span in the original order.Motivated by fine-grained structures in attention heatmaps, we transform explicit permutation into an online, index-guided, discrete loading policy; with extremely lightweight preprocessing and index-remapping overhead, it concentrates importance on a small set of high-priority blocks.Building on this importance-guided online permutation for loading, S2O further introduces an early-stopping rule: computation proceeds from high to low importance; once the current block score falls below a threshold, S2O terminates early and skips the remaining low-contribution blocks, thereby increasing effective sparsity and reducing computation under a controlled error budget.As a result, S2O substantially raises the practical sparsity ceiling.On Llama-3.1-8B under a 128K context, S2O reduces single-operator MSE by 3.82\times at matched sparsity, and reduces prefill compute density by 3.31\times at matched MSE; meanwhile, it preserves end-to-end accuracy and achieves 7.51\times attention and 3.81\times end-to-end speedups.</abstract>
<identifier type="citekey">zhang-etal-2026-s2o</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.351/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>7737</start>
<end>7751</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T S2O: Early Stopping for Sparse Attention via Online Permutation
%A Zhang, Yu
%A Liu, Songwei
%A Yan, Chenqian
%A Ning, Beichen
%A Chen, Fangmin
%A Wang, Xing
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%A Linsheng
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F zhang-etal-2026-s2o
%X Attention scales quadratically with sequence length, fundamentally limiting long-context inference.Existing block-granularity sparsification can reduce latency, but coarse blocks impose an intrinsic sparsity ceiling, making further improvements difficult even with carefully engineered designs.We present S2O, which performs early stopping for sparse attention via online permutation.Inspired by virtual-to-physical address mapping in memory systems, S2O revisits and factorizes FlashAttention execution, enabling inference to load non-contiguous tokens rather than a contiguous span in the original order.Motivated by fine-grained structures in attention heatmaps, we transform explicit permutation into an online, index-guided, discrete loading policy; with extremely lightweight preprocessing and index-remapping overhead, it concentrates importance on a small set of high-priority blocks.Building on this importance-guided online permutation for loading, S2O further introduces an early-stopping rule: computation proceeds from high to low importance; once the current block score falls below a threshold, S2O terminates early and skips the remaining low-contribution blocks, thereby increasing effective sparsity and reducing computation under a controlled error budget.As a result, S2O substantially raises the practical sparsity ceiling.On Llama-3.1-8B under a 128K context, S2O reduces single-operator MSE by 3.82\times at matched sparsity, and reduces prefill compute density by 3.31\times at matched MSE; meanwhile, it preserves end-to-end accuracy and achieves 7.51\times attention and 3.81\times end-to-end speedups.
%U https://aclanthology.org/2026.acl-long.351/
%P 7737-7751
Markdown (Informal)
[S2O: Early Stopping for Sparse Attention via Online Permutation](https://aclanthology.org/2026.acl-long.351/) (Zhang et al., ACL 2026)
ACL
- Yu Zhang, Songwei Liu, Chenqian Yan, Linsheng, Beichen Ning, Fangmin Chen, and Xing Wang. 2026. S2O: Early Stopping for Sparse Attention via Online Permutation. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 7737–7751, San Diego, California, United States. Association for Computational Linguistics.