% RouteMoA paper, ACL 2026 (Volume 1: Long Papers); canonical landing page is in the url field.
% The product name in the title is brace-protected as a whole word so styles keep its casing.
@inproceedings{wang-etal-2026-routemoa,
  title     = "{RouteMoA}: Dynamic Routing without Pre-Inference Boosts Efficient Mixture-of-Agents",
  author    = "Wang, Jize and
               Wu, Han and
               You, Zhiyuan and
               Song, Yiming and
               Wang, Yijun and
               Shan, Zifei and
               Li, Yining and
               Zhang, Songyang and
               Le, Xinyi and
               Chen, Cailian and
               Guan, Xinping and
               Tao, Dacheng",
  editor    = "Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David",
  booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
  month     = jul,
  year      = "2026",
  address   = "San Diego, California, United States",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2026.acl-long.558/",
  pages     = "12190--12208",
  isbn      = "979-8-89176-390-6",
  abstract  = "Mixture-of-Agents (MoA) improves LLM performance through layered collaboration, but its dense topology raises costs and latency. Existing methods employ LLM judges to filter responses, yet still require all models to perform inference before judging, failing to cut costs effectively. They also lack model selection criteria and struggle with large model pools, where full inference is costly and can exceed context limits. To address this, we propose **RouteMoA**, an efficient mixture-of-agents framework with dynamic routing. It employs a lightweight *scorer* to perform initial screening by predicting coarse-grained performance from the query, narrowing candidates to a high-potential subset without inference. A *mixture of judges* then refines these scores through lightweight self- and cross-assessment based on existing model outputs, providing posterior correction without additional inference. Finally, a *model ranking* mechanism selects models by balancing performance, cost, and latency. RouteMoA outperforms MoA across varying tasks and model pool sizes, reducing cost by 89.8{\%} and latency by 63.6{\%} in the large-scale model pool. Code is available at https://github.com/Jize-W/RouteMoA."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS v3 collection holding a single record; the mods ID below is repeated as the citekey identifier near the end of the record. -->
<mods ID="wang-etal-2026-routemoa">
<titleInfo>
<title>RouteMoA: Dynamic Routing without Pre-Inference Boosts Efficient Mixture-of-Agents</title>
</titleInfo>
<!-- Authors: one personal name element per author, each carrying the "author" role term. -->
<name type="personal">
<namePart type="given">Jize</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Han</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">You</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiming</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yijun</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zifei</namePart>
<namePart type="family">Shan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yining</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Songyang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyi</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cailian</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinping</namePart>
<namePart type="family">Guan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dacheng</namePart>
<namePart type="family">Tao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume this paper appears in, with its editors, publisher, place, genre and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<!-- This editor has two given-name parts; the second holds the initial "P" without a trailing period. -->
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<!-- Abstract text is stored as plain text; the asterisk emphasis markers are literal characters here. -->
<abstract>Mixture-of-Agents (MoA) improves LLM performance through layered collaboration, but its dense topology raises costs and latency. Existing methods employ LLM judges to filter responses, yet still require all models to perform inference before judging, failing to cut costs effectively. They also lack model selection criteria and struggle with large model pools, where full inference is costly and can exceed context limits. To address this, we propose **RouteMoA**, an efficient mixture-of-agents framework with dynamic routing. It employs a lightweight *scorer* to perform initial screening by predicting coarse-grained performance from the query, narrowing candidates to a high-potential subset without inference. A *mixture of judges* then refines these scores through lightweight self- and cross-assessment based on existing model outputs, providing posterior correction without additional inference. Finally, a *model ranking* mechanism selects models by balancing performance, cost, and latency. RouteMoA outperforms MoA across varying tasks and model pool sizes, reducing cost by 89.8% and latency by 63.6% in the large-scale model pool. Code is available at https://github.com/Jize-W/RouteMoA.</abstract>
<identifier type="citekey">wang-etal-2026-routemoa</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.558/</url>
</location>
<!-- Issue date and page extent of the paper (start and end page). -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>12190</start>
<end>12208</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T RouteMoA: Dynamic Routing without Pre-Inference Boosts Efficient Mixture-of-Agents
%A Wang, Jize
%A Wu, Han
%A You, Zhiyuan
%A Song, Yiming
%A Wang, Yijun
%A Shan, Zifei
%A Li, Yining
%A Zhang, Songyang
%A Le, Xinyi
%A Chen, Cailian
%A Guan, Xinping
%A Tao, Dacheng
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F wang-etal-2026-routemoa
%X Mixture-of-Agents (MoA) improves LLM performance through layered collaboration, but its dense topology raises costs and latency. Existing methods employ LLM judges to filter responses, yet still require all models to perform inference before judging, failing to cut costs effectively. They also lack model selection criteria and struggle with large model pools, where full inference is costly and can exceed context limits. To address this, we propose **RouteMoA**, an efficient mixture-of-agents framework with dynamic routing. It employs a lightweight *scorer* to perform initial screening by predicting coarse-grained performance from the query, narrowing candidates to a high-potential subset without inference. A *mixture of judges* then refines these scores through lightweight self- and cross-assessment based on existing model outputs, providing posterior correction without additional inference. Finally, a *model ranking* mechanism selects models by balancing performance, cost, and latency. RouteMoA outperforms MoA across varying tasks and model pool sizes, reducing cost by 89.8% and latency by 63.6% in the large-scale model pool. Code is available at https://github.com/Jize-W/RouteMoA.
%U https://aclanthology.org/2026.acl-long.558/
%P 12190-12208
Markdown (Informal)
[RouteMoA: Dynamic Routing without Pre-Inference Boosts Efficient Mixture-of-Agents](https://aclanthology.org/2026.acl-long.558/) (Wang et al., ACL 2026)
ACL
- Jize Wang, Han Wu, Zhiyuan You, Yiming Song, Yijun Wang, Zifei Shan, Yining Li, Songyang Zhang, Xinyi Le, Cailian Chen, Xinping Guan, and Dacheng Tao. 2026. RouteMoA: Dynamic Routing without Pre-Inference Boosts Efficient Mixture-of-Agents. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 12190–12208, San Diego, California, United States. Association for Computational Linguistics.