BibTeX
@inproceedings{shah-shridhar-2025-select,
title = "Select-then-Route : Taxonomy guided Routing for {LLM}s",
author = "Shah, Soham and
Shridhar, Kumar",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.28/",
pages = "425--441",
ISBN = "979-8-89176-333-3",
abstract = "Recent advances in large language models (LLMs) have boosted performance across a broad spectrum of natural{-}language tasks, yet no single model excels uniformly across domains. Sending each query to the most suitable model mitigates this limitation, but deciding among *all* available LLMs for each query is prohibitively expensive. Both the accuracy and the latency can improve if the decision space for the model choice is first narrowed, followed by selecting the suitable model for the given query.We introduce Select-then-Route (StR), a two{-}stage framework that first *selects* a small, task{-}appropriate pool of LLMs and then *routes* each query within that pool through an adaptive cascade. StR first employs a lightweight, *taxonomy{-}guided selector* that maps each query to models proven proficient for its semantic class (e.g., reasoning, code, summarisation). Within the selected pool, a *confidence{-}based cascade* begins with the cheapest model and escalates only when a multi{-}judge agreement test signals low reliability.Across six public benchmarks of various domains, StR improves the end{-}to{-}end accuracy from 91.7{\%} (best single model) to 94.3{\%} while reducing inference cost by 4X. Because both the taxonomy and multi-judge evaluation thresholds are tunable, StR exposes a smooth cost{--}accuracy frontier, enabling users to dial in the trade{-}off that best fits their latency and budget constraints."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shah-shridhar-2025-select">
<titleInfo>
<title>Select-then-Route : Taxonomy guided Routing for LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Soham</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kumar</namePart>
<namePart type="family">Shridhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saloni</namePart>
<namePart type="family">Potdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Rojas-Barahona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastien</namePart>
<namePart type="family">Montella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou (China)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-333-3</identifier>
</relatedItem>
<abstract>Recent advances in large language models (LLMs) have boosted performance across a broad spectrum of natural-language tasks, yet no single model excels uniformly across domains. Sending each query to the most suitable model mitigates this limitation, but deciding among *all* available LLMs for each query is prohibitively expensive. Both the accuracy and the latency can improve if the decision space for the model choice is first narrowed, followed by selecting the suitable model for the given query. We introduce Select-then-Route (StR), a two-stage framework that first *selects* a small, task-appropriate pool of LLMs and then *routes* each query within that pool through an adaptive cascade. StR first employs a lightweight, *taxonomy-guided selector* that maps each query to models proven proficient for its semantic class (e.g., reasoning, code, summarisation). Within the selected pool, a *confidence-based cascade* begins with the cheapest model and escalates only when a multi-judge agreement test signals low reliability. Across six public benchmarks of various domains, StR improves the end-to-end accuracy from 91.7% (best single model) to 94.3% while reducing inference cost by 4X. Because both the taxonomy and multi-judge evaluation thresholds are tunable, StR exposes a smooth cost–accuracy frontier, enabling users to dial in the trade-off that best fits their latency and budget constraints.</abstract>
<identifier type="citekey">shah-shridhar-2025-select</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-industry.28/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>425</start>
<end>441</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Select-then-Route : Taxonomy guided Routing for LLMs
%A Shah, Soham
%A Shridhar, Kumar
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F shah-shridhar-2025-select
%X Recent advances in large language models (LLMs) have boosted performance across a broad spectrum of natural-language tasks, yet no single model excels uniformly across domains. Sending each query to the most suitable model mitigates this limitation, but deciding among *all* available LLMs for each query is prohibitively expensive. Both the accuracy and the latency can improve if the decision space for the model choice is first narrowed, followed by selecting the suitable model for the given query. We introduce Select-then-Route (StR), a two-stage framework that first *selects* a small, task-appropriate pool of LLMs and then *routes* each query within that pool through an adaptive cascade. StR first employs a lightweight, *taxonomy-guided selector* that maps each query to models proven proficient for its semantic class (e.g., reasoning, code, summarisation). Within the selected pool, a *confidence-based cascade* begins with the cheapest model and escalates only when a multi-judge agreement test signals low reliability. Across six public benchmarks of various domains, StR improves the end-to-end accuracy from 91.7% (best single model) to 94.3% while reducing inference cost by 4X. Because both the taxonomy and multi-judge evaluation thresholds are tunable, StR exposes a smooth cost–accuracy frontier, enabling users to dial in the trade-off that best fits their latency and budget constraints.
%U https://aclanthology.org/2025.emnlp-industry.28/
%P 425-441
Markdown (Informal)
[Select-then-Route : Taxonomy guided Routing for LLMs](https://aclanthology.org/2025.emnlp-industry.28/) (Shah & Shridhar, EMNLP 2025)
ACL
Soham Shah and Kumar Shridhar. 2025. Select-then-Route : Taxonomy guided Routing for LLMs. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 425–441, Suzhou (China). Association for Computational Linguistics.
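The abstract describes a two-stage control flow: a taxonomy-guided selector narrows the candidate models to a task-appropriate pool, then a cheapest-first cascade escalates only when a multi-judge agreement check signals low reliability. As a rough illustration of that flow only (not the paper's implementation: the helpers `classify_query`, `call_model`, `judge_agreement`, and the `MODEL_POOLS` table below are hypothetical placeholders inferred from the abstract), a minimal sketch might look like:

```python
# Minimal sketch of the Select-then-Route (StR) idea as described in the
# abstract. All names and the taxonomy contents are illustrative
# assumptions, not the authors' actual API or model choices.
from typing import Callable

# Assumed taxonomy: semantic class -> models ordered cheapest-first.
MODEL_POOLS: dict[str, list[str]] = {
    "reasoning": ["small-reasoner", "mid-reasoner", "large-reasoner"],
    "code": ["small-coder", "large-coder"],
    "summarisation": ["small-summariser", "large-summariser"],
}

def select_then_route(
    query: str,
    classify_query: Callable[[str], str],          # lightweight taxonomy selector
    call_model: Callable[[str, str], str],         # (model, query) -> answer
    judge_agreement: Callable[[str, str], float],  # multi-judge score in [0, 1]
    threshold: float = 0.8,                        # tunable reliability cutoff
) -> str:
    """Route a query through a taxonomy-selected, cheapest-first cascade."""
    # Stage 1 (select): narrow the decision space to a task-appropriate pool.
    pool = MODEL_POOLS[classify_query(query)]

    # Stage 2 (route): start with the cheapest model and escalate only when
    # the multi-judge agreement check signals low reliability.
    answer = ""
    for model in pool:
        answer = call_model(model, query)
        if judge_agreement(query, answer) >= threshold:
            return answer
    return answer  # fall back to the most capable model's answer
```

Here `threshold` stands in for the tunable multi-judge agreement threshold the abstract mentions: raising it escalates more queries to costlier models, which is one way to move along the cost–accuracy frontier the authors describe.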