@inproceedings{liang-etal-2025-conv,
    title = "{Conv-Basis}: A New Paradigm for Efficient Attention Inference and Gradient Computation in {Transformers}",
    author = "Liang, Yingyu and
      Liu, Heshan and
      Shi, Zhenmei and
      Song, Zhao and
      Xu, Zhuoyan and
      Zhao, Jiale and
      Zhuang, Zhen",
    editor = "Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet",
    booktitle = "Findings of the Association for Computational Linguistics: {EMNLP} 2025",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-emnlp.363/",
    doi = "10.18653/v1/2025.findings-emnlp.363",
    pages = "6857--6894",
    isbn = "979-8-89176-335-7",
    abstract = "The self-attention mechanism is key to the success of transformers in recent large language models (LLMs). However, the quadratic computational cost, $O(n^2)$, with respect to the input sequence length $n$ poses a significant obstacle to further improvement and scalability in longer contexts. In this work, we leverage the convolution-like structure of attention matrices to develop an efficient approximation method for attention computation using convolution matrices. We propose a $\mathsf{conv}$ basis system, analogous to the rank basis, and show that any lower triangular matrix can be decomposed as a sum of structured convolution matrices in this basis. We then design a fast algorithm to approximate the attention matrix using a sum of $k$ convolution matrices. This enables us to compute attention during \textit{inference} via Fast Fourier Transforms (FFT) in $O(knd \log n)$ time, where $d$ is the hidden dimension, achieving nearly linear time complexity, $n^{1+o(1)}$, in practical scenarios where $kd = n^{o(1)}$. Furthermore, both \textit{training forward} and \textit{backward gradient} computations can be performed in $n^{1+o(1)}$ time as well. We provide theoretical guarantees on runtime and approximation error and conduct preliminary experiments to evaluate the effectiveness of our approach. We hope this new paradigm for accelerating attention computation in transformer models facilitates their application to longer contexts."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liang-etal-2025-conv">
<titleInfo>
<title>Conv-Basis: A New Paradigm for Efficient Attention Inference and Gradient Computation in Transformers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yingyu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heshan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenmei</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhao</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuoyan</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiale</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhen</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>The self-attention mechanism is key to the success of transformers in recent large language models (LLMs). However, the quadratic computational cost, O(n²), with respect to the input sequence length n poses a significant obstacle to further improvement and scalability in longer contexts. In this work, we leverage the convolution-like structure of attention matrices to develop an efficient approximation method for attention computation using convolution matrices. We propose a conv basis system, analogous to the rank basis, and show that any lower triangular matrix can be decomposed as a sum of structured convolution matrices in this basis. We then design a fast algorithm to approximate the attention matrix using a sum of k convolution matrices. This enables us to compute attention during inference via Fast Fourier Transforms (FFT) in O(knd log n) time, where d is the hidden dimension, achieving nearly linear time complexity, n^{1+o(1)}, in practical scenarios where kd = n^{o(1)}. Furthermore, both training forward and backward gradient computations can be performed in n^{1+o(1)} time as well. We provide theoretical guarantees on runtime and approximation error and conduct preliminary experiments to evaluate the effectiveness of our approach. We hope this new paradigm for accelerating attention computation in transformer models facilitates their application to longer contexts.</abstract>
<identifier type="citekey">liang-etal-2025-conv</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.363</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.363/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>6857</start>
<end>6894</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Conv-Basis: A New Paradigm for Efficient Attention Inference and Gradient Computation in Transformers
%A Liang, Yingyu
%A Liu, Heshan
%A Shi, Zhenmei
%A Song, Zhao
%A Xu, Zhuoyan
%A Zhao, Jiale
%A Zhuang, Zhen
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F liang-etal-2025-conv
%X The self-attention mechanism is key to the success of transformers in recent large language models (LLMs). However, the quadratic computational cost, O(n²), with respect to the input sequence length n poses a significant obstacle to further improvement and scalability in longer contexts. In this work, we leverage the convolution-like structure of attention matrices to develop an efficient approximation method for attention computation using convolution matrices. We propose a conv basis system, analogous to the rank basis, and show that any lower triangular matrix can be decomposed as a sum of structured convolution matrices in this basis. We then design a fast algorithm to approximate the attention matrix using a sum of k convolution matrices. This enables us to compute attention during inference via Fast Fourier Transforms (FFT) in O(knd log n) time, where d is the hidden dimension, achieving nearly linear time complexity, n^{1+o(1)}, in practical scenarios where kd = n^{o(1)}. Furthermore, both training forward and backward gradient computations can be performed in n^{1+o(1)} time as well. We provide theoretical guarantees on runtime and approximation error and conduct preliminary experiments to evaluate the effectiveness of our approach. We hope this new paradigm for accelerating attention computation in transformer models facilitates their application to longer contexts.
%R 10.18653/v1/2025.findings-emnlp.363
%U https://aclanthology.org/2025.findings-emnlp.363/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.363
%P 6857-6894
Markdown (Informal)
[Conv-Basis: A New Paradigm for Efficient Attention Inference and Gradient Computation in Transformers](https://aclanthology.org/2025.findings-emnlp.363/) (Liang et al., Findings 2025)
ACL