@inproceedings{zhao-zhu-2025-skyllm,
title = "{S}ky{LLM}: Cross-{LLM}-{API}s Federation for Cost-effective Query Processing",
author = "Zhao, Heng and
Zhu, Yifei",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1073/",
doi = "10.18653/v1/2025.findings-acl.1073",
pages = "20864--20873",
ISBN = "979-8-89176-256-5",
abstract = "Large language models (LLMs) have demonstrated exceptional capabilities across a wide range of tasks, from text generation to complex problem-solving. LLM APIs provide easy access to these models by streamlining deployment and usage. Combining LLMs with complementary strengths has been shown to yield substantial performance gains over a monolithic LLM. However, invoking a fixed set of LLM APIs for each query incurs higher API costs and increased inference latency. To address these limitations, we propose SkyLLM, a system composed of a set of estimators and an API selector, which federates multiple LLM APIs and dynamically assigns a non-empty subset of these APIs to each query prior to inference under cost and latency budgets. The selected subset consists of either a single LLM or multiple LLMs. A single LLM efficiently handles simple queries at low cost, whereas multiple LLMs are employed for more complex queries to overcome performance limitations. We evaluate SkyLLM against individual LLMs and representative ensemble LLM methods from the literature. SkyLLM achieves the highest accuracy under a high budget. It can also be cost-effective, matching the most accurate individual LLM while cutting costs by 67.8{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-zhu-2025-skyllm">
<titleInfo>
<title>SkyLLM: Cross-LLM-APIs Federation for Cost-effective Query Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifei</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Large language models (LLMs) have demonstrated exceptional capabilities across a wide range of tasks, from text generation to complex problem-solving. LLM APIs provide easy access to these models by streamlining deployment and usage. Combining LLMs with complementary strengths has been shown to yield substantial performance gains over a monolithic LLM. However, invoking a fixed set of LLM APIs for each query incurs higher API costs and increased inference latency. To address these limitations, we propose SkyLLM, a system composed of a set of estimators and an API selector, which federates multiple LLM APIs and dynamically assigns a non-empty subset of these APIs to each query prior to inference under cost and latency budgets. The selected subset consists of either a single LLM or multiple LLMs. A single LLM efficiently handles simple queries at low cost, whereas multiple LLMs are employed for more complex queries to overcome performance limitations. We evaluate SkyLLM against individual LLMs and representative ensemble LLM methods from the literature. SkyLLM achieves the highest accuracy under a high budget. It can also be cost-effective, matching the most accurate individual LLM while cutting costs by 67.8%.</abstract>
<identifier type="citekey">zhao-zhu-2025-skyllm</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1073</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1073/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>20864</start>
<end>20873</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SkyLLM: Cross-LLM-APIs Federation for Cost-effective Query Processing
%A Zhao, Heng
%A Zhu, Yifei
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F zhao-zhu-2025-skyllm
%X Large language models (LLMs) have demonstrated exceptional capabilities across a wide range of tasks, from text generation to complex problem-solving. LLM APIs provide easy access to these models by streamlining deployment and usage. Combining LLMs with complementary strengths has been shown to yield substantial performance gains over a monolithic LLM. However, invoking a fixed set of LLM APIs for each query incurs higher API costs and increased inference latency. To address these limitations, we propose SkyLLM, a system composed of a set of estimators and an API selector, which federates multiple LLM APIs and dynamically assigns a non-empty subset of these APIs to each query prior to inference under cost and latency budgets. The selected subset consists of either a single LLM or multiple LLMs. A single LLM efficiently handles simple queries at low cost, whereas multiple LLMs are employed for more complex queries to overcome performance limitations. We evaluate SkyLLM against individual LLMs and representative ensemble LLM methods from the literature. SkyLLM achieves the highest accuracy under a high budget. It can also be cost-effective, matching the most accurate individual LLM while cutting costs by 67.8%.
%R 10.18653/v1/2025.findings-acl.1073
%U https://aclanthology.org/2025.findings-acl.1073/
%U https://doi.org/10.18653/v1/2025.findings-acl.1073
%P 20864-20873
Markdown (Informal)
[SkyLLM: Cross-LLM-APIs Federation for Cost-effective Query Processing](https://aclanthology.org/2025.findings-acl.1073/) (Zhao & Zhu, Findings 2025)
ACL
Heng Zhao and Yifei Zhu. 2025. SkyLLM: Cross-LLM-APIs Federation for Cost-effective Query Processing. In Findings of the Association for Computational Linguistics: ACL 2025, pages 20864–20873, Vienna, Austria. Association for Computational Linguistics.
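
For readers skimming this record, a toy sketch of the selection idea described in the abstract may help: choose a non-empty subset of LLM APIs whose estimated cost and latency fit the given budgets, scoring feasible subsets by predicted accuracy. Everything below (the API names, the per-API estimates, and the combination rule) is hypothetical and purely illustrative; it is not SkyLLM's actual estimators or selector.

```python
# Hypothetical sketch: budget-constrained selection of a non-empty subset of
# LLM APIs, as outlined in the paper's abstract. Names and numbers are made up.
from itertools import chain, combinations
from typing import Dict, Iterable, Tuple

# Illustrative per-API estimates for one query: (accuracy, cost in $, latency in s).
ESTIMATES: Dict[str, Tuple[float, float, float]] = {
    "api_a": (0.78, 0.002, 0.9),
    "api_b": (0.83, 0.010, 1.4),
    "api_c": (0.71, 0.001, 0.6),
}

def non_empty_subsets(apis: Iterable[str]):
    """Yield every non-empty subset of the given APIs."""
    apis = list(apis)
    return chain.from_iterable(combinations(apis, k) for k in range(1, len(apis) + 1))

def select_apis(cost_budget: float, latency_budget: float) -> Tuple[str, ...]:
    """Return the feasible subset with the highest toy ensemble score."""
    best, best_score = (), -1.0
    for subset in non_empty_subsets(ESTIMATES):
        cost = sum(ESTIMATES[a][1] for a in subset)     # costs accumulate
        latency = max(ESTIMATES[a][2] for a in subset)  # calls assumed parallel
        if cost > cost_budget or latency > latency_budget:
            continue
        # Toy score: probability that at least one selected API answers correctly.
        miss = 1.0
        for a in subset:
            miss *= 1.0 - ESTIMATES[a][0]
        score = 1.0 - miss
        if score > best_score:
            best, best_score = subset, score
    return best

if __name__ == "__main__":
    # With a looser budget, multiple APIs are selected; with a tight one, a
    # single cheap API wins -- the trade-off the abstract describes.
    print(select_apis(cost_budget=0.012, latency_budget=1.5))
    print(select_apis(cost_budget=0.002, latency_budget=1.0))
```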