@inproceedings{elder-etal-2026-live,
  title     = "Live {API}-Bench: 2500+ Live {API}s for Testing Multi-Step Tool Calling",
  author    = "Elder, Benjamin and
               Murthi, Anupama and
               Kang, Jungkoo and
               Naik, Ankita and
               Basu, Kinjal and
               Kate, Kiran and
               Contractor, Danish",
  editor    = "Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'\i}s",
  booktitle = "Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)",
  month     = mar,
  year      = "2026",
  address   = "Rabat, Morocco",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2026.eacl-long.143/",
  pages     = "3092--3124",
  isbn      = "979-8-89176-380-7",
  abstract  = "Large language models (LLMs) increasingly rely on external tools and APIs to execute complex tasks specified in natural language. Evaluating such tool-calling capabilities in realistic enterprise settings is challenging: APIs are often proprietary, heterogeneous, and difficult to share, limiting reproducible benchmarks. To address this, we introduce Live API Bench, a comprehensive benchmark constructed by transforming NL2SQL datasets into interactive API environments. Our pipeline converts SQL queries from BIRD-SQL into executable API sequences across three formulations{---}SLOT, SEL, and REST{---}covering minimal general-purpose operations, domain-specific multi-step tasks, and function-oriented RESTful interactions, respectively. The benchmark spans 11 databases with over 2,500 invocable tools, paired with human-authored queries, ground-truth API sequences, and verified final answers. Live API Bench enables systematic evaluation of core challenges in tool use, including error handling, sequential reasoning, parameter generation, response parsing, and robustness across diverse domains. We evaluate 10 LLMs and 4 ReACT agents, observing low task completion rates (7{--}47{\%}), which improve modestly to 50{\%} under interactive agent settings, highlighting substantial scope for improving LLM tool-calling performance. We release all code and data associated with this paper."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="elder-etal-2026-live">
<titleInfo>
<title>Live API-Bench: 2500+ Live APIs for Testing Multi-Step Tool Calling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Elder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anupama</namePart>
<namePart type="family">Murthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jungkoo</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ankita</namePart>
<namePart type="family">Naik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kinjal</namePart>
<namePart type="family">Basu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kiran</namePart>
<namePart type="family">Kate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danish</namePart>
<namePart type="family">Contractor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Large language models (LLMs) increasingly rely on external tools and APIs to execute complex tasks specified in natural language. Evaluating such tool-calling capabilities in realistic enterprise settings is challenging: APIs are often proprietary, heterogeneous, and difficult to share, limiting reproducible benchmarks. To address this, we introduce Live API Bench, a comprehensive benchmark constructed by transforming NL2SQL datasets into interactive API environments. Our pipeline converts SQL queries from BIRD-SQL into executable API sequences across three formulations—SLOT, SEL, and REST—covering minimal general-purpose operations, domain-specific multi-step tasks, and function-oriented RESTful interactions, respectively. The benchmark spans 11 databases with over 2,500 invocable tools, paired with human-authored queries, ground-truth API sequences, and verified final answers. Live API Bench enables systematic evaluation of core challenges in tool use, including error handling, sequential reasoning, parameter generation, response parsing, and robustness across diverse domains. We evaluate 10 LLMs and 4 ReACT agents, observing low task completion rates (7–47%), which improve modestly to 50% under interactive agent settings, highlighting substantial scope for improving LLM tool-calling performance. We release all code and data associated with this paper.</abstract>
<identifier type="citekey">elder-etal-2026-live</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.143/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>3092</start>
<end>3124</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Live API-Bench: 2500+ Live APIs for Testing Multi-Step Tool Calling
%A Elder, Benjamin
%A Murthi, Anupama
%A Kang, Jungkoo
%A Naik, Ankita
%A Basu, Kinjal
%A Kate, Kiran
%A Contractor, Danish
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F elder-etal-2026-live
%X Large language models (LLMs) increasingly rely on external tools and APIs to execute complex tasks specified in natural language. Evaluating such tool-calling capabilities in realistic enterprise settings is challenging: APIs are often proprietary, heterogeneous, and difficult to share, limiting reproducible benchmarks. To address this, we introduce Live API Bench, a comprehensive benchmark constructed by transforming NL2SQL datasets into interactive API environments. Our pipeline converts SQL queries from BIRD-SQL into executable API sequences across three formulations—SLOT, SEL, and REST—covering minimal general-purpose operations, domain-specific multi-step tasks, and function-oriented RESTful interactions, respectively. The benchmark spans 11 databases with over 2,500 invocable tools, paired with human-authored queries, ground-truth API sequences, and verified final answers. Live API Bench enables systematic evaluation of core challenges in tool use, including error handling, sequential reasoning, parameter generation, response parsing, and robustness across diverse domains. We evaluate 10 LLMs and 4 ReACT agents, observing low task completion rates (7–47%), which improve modestly to 50% under interactive agent settings, highlighting substantial scope for improving LLM tool-calling performance. We release all code and data associated with this paper.
%U https://aclanthology.org/2026.eacl-long.143/
%P 3092-3124
Markdown (Informal)
[Live API-Bench: 2500+ Live APIs for Testing Multi-Step Tool Calling](https://aclanthology.org/2026.eacl-long.143/) (Elder et al., EACL 2026)
ACL
- Benjamin Elder, Anupama Murthi, Jungkoo Kang, Ankita Naik, Kinjal Basu, Kiran Kate, and Danish Contractor. 2026. Live API-Bench: 2500+ Live APIs for Testing Multi-Step Tool Calling. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 3092–3124, Rabat, Morocco. Association for Computational Linguistics.