% Conference-proceedings paper record (citation key: fichtl-etal-2026-challenging).
% Field values are brace-delimited; the entry is closed on its own line so the
% XML prolog that follows starts at the beginning of a line.
@inproceedings{fichtl-etal-2026-challenging,
  title     = {Challenging Quadratic Attention - A Holistic View On the Rise of Alternative Language Model Architectures},
  author    = {Fichtl, Alexander M. and
               Bohn, Jeremias and
               Kelber, Josefin and
               Mosca, Edoardo and
               Groh, Georg},
  editor    = {Elazar, Yanai and
               Ettinger, Allyson and
               Kassner, Nora and
               Ruder, Sebastian},
  booktitle = {Proceedings of The Big Picture v2: Crafting a Research Narrative},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.bigpicture-main.6/},
  doi       = {10.18653/v1/2026.bigpicture-main.6},
  pages     = {60--81},
  isbn      = {979-8-89176-416-3},
  abstract  = {Transformers have dominated sequence processing tasks for the past seven years{---}most notably language modeling. However, the inherent quadratic complexity of their attention mechanism remains a significant bottleneck as context length increases. We review and distill the recent efforts to overcome this bottleneck, including advances in (sub-quadratic) attention variants, recurrent neural networks, state space models, and hybrid architectures. We critically analyze approaches regarding compute and memory complexity, benchmark results, and fundamental limitations to assess whether the dominance of pure-attention transformers may soon be challenged, which we consider possible, particularly in domain-specific and edge-device applications.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="fichtl-etal-2026-challenging">
    <titleInfo>
      <title>Challenging Quadratic Attention - A Holistic View On the Rise of Alternative Language Model Architectures</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Alexander</namePart>
      <namePart type="given">M</namePart>
      <namePart type="family">Fichtl</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jeremias</namePart>
      <namePart type="family">Bohn</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Josefin</namePart>
      <namePart type="family">Kelber</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Edoardo</namePart>
      <namePart type="family">Mosca</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Georg</namePart>
      <namePart type="family">Groh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of The Big Picture v2: Crafting a Research Narrative</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yanai</namePart>
        <namePart type="family">Elazar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Allyson</namePart>
        <namePart type="family">Ettinger</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nora</namePart>
        <namePart type="family">Kassner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sebastian</namePart>
        <namePart type="family">Ruder</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, CA, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-416-3</identifier>
    </relatedItem>
    <abstract>Transformers have dominated sequence processing tasks for the past seven years—most notably language modeling. However, the inherent quadratic complexity of their attention mechanism remains a significant bottleneck as context length increases. We review and distill the recent efforts to overcome this bottleneck, including advances in (sub-quadratic) attention variants, recurrent neural networks, state space models, and hybrid architectures. We critically analyze approaches regarding compute and memory complexity, benchmark results, and fundamental limitations to assess whether the dominance of pure-attention transformers may soon be challenged, which we consider possible, particularly in domain-specific and edge-device applications.</abstract>
    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">fichtl-etal-2026-challenging</identifier>
    <identifier type="doi">10.18653/v1/2026.bigpicture-main.6</identifier>
    <location>
      <url>https://aclanthology.org/2026.bigpicture-main.6/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>60</start>
        <end>81</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Challenging Quadratic Attention - A Holistic View On the Rise of Alternative Language Model Architectures
%A Fichtl, Alexander M.
%A Bohn, Jeremias
%A Kelber, Josefin
%A Mosca, Edoardo
%A Groh, Georg
%Y Elazar, Yanai
%Y Ettinger, Allyson
%Y Kassner, Nora
%Y Ruder, Sebastian
%S Proceedings of The Big Picture v2: Crafting a Research Narrative
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, CA, USA
%@ 979-8-89176-416-3
%F fichtl-etal-2026-challenging
%X Transformers have dominated sequence processing tasks for the past seven years—most notably language modeling. However, the inherent quadratic complexity of their attention mechanism remains a significant bottleneck as context length increases. We review and distill the recent efforts to overcome this bottleneck, including advances in (sub-quadratic) attention variants, recurrent neural networks, state space models, and hybrid architectures. We critically analyze approaches regarding compute and memory complexity, benchmark results, and fundamental limitations to assess whether the dominance of pure-attention transformers may soon be challenged, which we consider possible, particularly in domain-specific and edge-device applications.
%R 10.18653/v1/2026.bigpicture-main.6
%U https://aclanthology.org/2026.bigpicture-main.6/
%U https://doi.org/10.18653/v1/2026.bigpicture-main.6
%P 60-81
Markdown (Informal)
[Challenging Quadratic Attention - A Holistic View On the Rise of Alternative Language Model Architectures](https://aclanthology.org/2026.bigpicture-main.6/) (Fichtl et al., BigPicture 2026)
ACL