% Conference paper record: "FlowHN: Adaptive Token Routing for Efficient
% Parallel Hybrid Networks" (Moradi et al., 2026), as listed at
% https://aclanthology.org/2026.acl-industry.83/
%
% Notes for maintainers:
%   - Case-sensitive words are protected with whole-word braces ({FlowHN},
%     {ACL}) so sentence-casing styles keep them intact.
%   - The address field presumably holds the meeting location rather than the
%     publisher's city (TODO confirm against the target style's expectations).
%   - The abstract keeps its LaTeX escapes ({---}, {--}, {\texttimes}, {\%}).
@inproceedings{moradi-etal-2026-flowhn,
  title     = {{FlowHN}: Adaptive Token Routing for Efficient Parallel Hybrid Networks},
  author    = {Moradi, Mohammad Mahdi and
               Ahmed, Walid and
               Wen, Shuangyue and
               Mudur, Sudhir and
               Zhang, Weiwei and
               Liu, Yang},
  editor    = {Li, Yunyao and
               Rehm, Georg and
               Tu, Mei},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-industry.83/},
  pages     = {1170--1181},
  isbn      = {979-8-89176-394-4},
  abstract  = {Production LLMs must balance modeling quality with predictable latency, stable accelerator utilization, and cost-efficient scaling{---}constraints that remain difficult for existing architectures. Transformers provide strong reasoning but incur quadratic complexity, while state-space models (SSMs) scale efficiently yet lack fine-grained interactions; prior hybrids either introduce sequential bottlenecks or rely on learned routing that complicates deployment. We present FlowHN, a deployment-oriented parallel hybrid architecture that enables deterministic conditional computation via FLOP-aware token circulation across attention and SSM branches. Instead of dynamic expert routing, FlowHN performs hardware-aligned token scheduling that balances workloads, reduces synchronization stalls, and preserves full parameter utilization. Across 135M{--}1B models, FlowHN achieves up to 4{\texttimes} higher throughput and 15{\%} higher MFU than strong Transformer, SSM, and hybrid baselines while maintaining competitive accuracy on reasoning, coding, and long-context tasks up to 32K tokens. FlowHN is designed to integrate directly into existing Hybrid pipelines without changes to optimizers, training stacks, or inference serving infrastructure, making it practical for real-world deployment.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="moradi-etal-2026-flowhn">

    <!-- Title of the paper. -->
    <titleInfo>
      <title>FlowHN: Adaptive Token Routing for Efficient Parallel Hybrid Networks</title>
    </titleInfo>

    <!-- Authors, in record order (six personal names, role "author"). -->
    <name type="personal">
      <namePart type="given">Mohammad</namePart>
      <namePart type="given">Mahdi</namePart>
      <namePart type="family">Moradi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Walid</namePart>
      <namePart type="family">Ahmed</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shuangyue</namePart>
      <namePart type="family">Wen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sudhir</namePart>
      <namePart type="family">Mudur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Weiwei</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yang</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year and month). -->
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume containing the paper, with its
         editors, publisher, place, genre and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Georg</namePart>
        <namePart type="family">Rehm</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mei</namePart>
        <namePart type="family">Tu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-394-4</identifier>
    </relatedItem>

    <!-- Abstract text, stored as plain Unicode (no LaTeX escapes). -->
    <abstract>Production LLMs must balance modeling quality with predictable latency, stable accelerator utilization, and cost-efficient scaling—constraints that remain difficult for existing architectures. Transformers provide strong reasoning but incur quadratic complexity, while state-space models (SSMs) scale efficiently yet lack fine-grained interactions; prior hybrids either introduce sequential bottlenecks or rely on learned routing that complicates deployment. We present FlowHN, a deployment-oriented parallel hybrid architecture that enables deterministic conditional computation via FLOP-aware token circulation across attention and SSM branches. Instead of dynamic expert routing, FlowHN performs hardware-aligned token scheduling that balances workloads, reduces synchronization stalls, and preserves full parameter utilization. Across 135M–1B models, FlowHN achieves up to 4× higher throughput and 15% higher MFU than strong Transformer, SSM, and hybrid baselines while maintaining competitive accuracy on reasoning, coding, and long-context tasks up to 32K tokens. FlowHN is designed to integrate directly into existing Hybrid pipelines without changes to optimizers, training stacks, or inference serving infrastructure, making it practical for real-world deployment.</abstract>

    <!-- Citation key and landing-page URL. -->
    <identifier type="citekey">moradi-etal-2026-flowhn</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-industry.83/</url>
    </location>

    <!-- Position within the host volume: date and page extent. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>1170</start>
        <end>1181</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T FlowHN: Adaptive Token Routing for Efficient Parallel Hybrid Networks
%A Moradi, Mohammad Mahdi
%A Ahmed, Walid
%A Wen, Shuangyue
%A Mudur, Sudhir
%A Zhang, Weiwei
%A Liu, Yang
%Y Li, Yunyao
%Y Rehm, Georg
%Y Tu, Mei
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-394-4
%F moradi-etal-2026-flowhn
%X Production LLMs must balance modeling quality with predictable latency, stable accelerator utilization, and cost-efficient scaling—constraints that remain difficult for existing architectures. Transformers provide strong reasoning but incur quadratic complexity, while state-space models (SSMs) scale efficiently yet lack fine-grained interactions; prior hybrids either introduce sequential bottlenecks or rely on learned routing that complicates deployment. We present FlowHN, a deployment-oriented parallel hybrid architecture that enables deterministic conditional computation via FLOP-aware token circulation across attention and SSM branches. Instead of dynamic expert routing, FlowHN performs hardware-aligned token scheduling that balances workloads, reduces synchronization stalls, and preserves full parameter utilization. Across 135M–1B models, FlowHN achieves up to 4× higher throughput and 15% higher MFU than strong Transformer, SSM, and hybrid baselines while maintaining competitive accuracy on reasoning, coding, and long-context tasks up to 32K tokens. FlowHN is designed to integrate directly into existing Hybrid pipelines without changes to optimizers, training stacks, or inference serving infrastructure, making it practical for real-world deployment.
%U https://aclanthology.org/2026.acl-industry.83/
%P 1170-1181
Markdown (Informal)
[FlowHN: Adaptive Token Routing for Efficient Parallel Hybrid Networks](https://aclanthology.org/2026.acl-industry.83/) (Moradi et al., ACL 2026)
ACL
- Mohammad Mahdi Moradi, Walid Ahmed, Shuangyue Wen, Sudhir Mudur, Weiwei Zhang, and Yang Liu. 2026. FlowHN: Adaptive Token Routing for Efficient Parallel Hybrid Networks. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 1170–1181, San Diego, California, USA. Association for Computational Linguistics.