@inproceedings{peng-etal-2025-firewall,
title = "Firewall Routing: Blocking Leads to Better Hybrid Inference for {LLM}s",
author = "Peng, Runyu and
Zhou, Yunhua and
Lv, Kai and
Gao, Yang and
Guo, Qipeng and
Qiu, Xipeng",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.331/",
pages = "6540--6565",
ISBN = "979-8-89176-332-6",
abstract = "The rapid advancement of Large Language Models (LLMs) has significantly enhanced performance across various natural language processing (NLP) tasks, yet the high computational costs and latency associated with deploying such models continue to pose critical bottlenecks, limiting their broader applicability. To mitigate these challenges, we propose a dynamic hybrid inference framework, Firewall Routing, which efficiently selects between a strong and a weak LLMs based on the complexity of the query. A lightweight routing model is trained to optimize resource allocation by learning from response quality and preventing long-tail queries, which are often too hard to solve by LLMs, from being routed to the stronger model. Moreover, our method incorporates multiple sampling to enhance query evaluation reliability while leveraging Hard Blocking and Soft Blocking to handle long-tail queries along with refining labels for model selection. Extensive experiments show our method outperforms existing routing strategies by up to 5.29{\%} in APGR, demonstrating state-of-the-art performance across multiple benchmarks."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="peng-etal-2025-firewall">
<titleInfo>
<title>Firewall Routing: Blocking Leads to Better Hybrid Inference for LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Runyu</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunhua</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">Lv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qipeng</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xipeng</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>The rapid advancement of Large Language Models (LLMs) has significantly enhanced performance across various natural language processing (NLP) tasks, yet the high computational costs and latency associated with deploying such models continue to pose critical bottlenecks, limiting their broader applicability. To mitigate these challenges, we propose a dynamic hybrid inference framework, Firewall Routing, which efficiently selects between a strong and a weak LLMs based on the complexity of the query. A lightweight routing model is trained to optimize resource allocation by learning from response quality and preventing long-tail queries, which are often too hard to solve by LLMs, from being routed to the stronger model. Moreover, our method incorporates multiple sampling to enhance query evaluation reliability while leveraging Hard Blocking and Soft Blocking to handle long-tail queries along with refining labels for model selection. Extensive experiments show our method outperforms existing routing strategies by up to 5.29% in APGR, demonstrating state-of-the-art performance across multiple benchmarks.</abstract>
<identifier type="citekey">peng-etal-2025-firewall</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.331/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>6540</start>
<end>6565</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Firewall Routing: Blocking Leads to Better Hybrid Inference for LLMs
%A Peng, Runyu
%A Zhou, Yunhua
%A Lv, Kai
%A Gao, Yang
%A Guo, Qipeng
%A Qiu, Xipeng
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F peng-etal-2025-firewall
%X The rapid advancement of Large Language Models (LLMs) has significantly enhanced performance across various natural language processing (NLP) tasks, yet the high computational costs and latency associated with deploying such models continue to pose critical bottlenecks, limiting their broader applicability. To mitigate these challenges, we propose a dynamic hybrid inference framework, Firewall Routing, which efficiently selects between a strong and a weak LLMs based on the complexity of the query. A lightweight routing model is trained to optimize resource allocation by learning from response quality and preventing long-tail queries, which are often too hard to solve by LLMs, from being routed to the stronger model. Moreover, our method incorporates multiple sampling to enhance query evaluation reliability while leveraging Hard Blocking and Soft Blocking to handle long-tail queries along with refining labels for model selection. Extensive experiments show our method outperforms existing routing strategies by up to 5.29% in APGR, demonstrating state-of-the-art performance across multiple benchmarks.
%U https://aclanthology.org/2025.emnlp-main.331/
%P 6540-6565
Markdown (Informal)
[Firewall Routing: Blocking Leads to Better Hybrid Inference for LLMs](https://aclanthology.org/2025.emnlp-main.331/) (Peng et al., EMNLP 2025)
ACL