@inproceedings{polyakov-etal-2025-toolreflection,
title = "{T}ool{R}eflection: Improving Large Language Models for Real-World {API} Calls with Self-Generated Data",
author = "Polyakov, Gregory and
Alimova, Ilseyar and
Abulkhanov, Dmitry and
Sedykh, Ivan and
Bout, Andrey and
Nikolenko, Sergey and
Piontkovskaya, Irina",
editor = "Kamalloo, Ehsan and
Gontier, Nicolas and
Lu, Xing Han and
Dziri, Nouha and
Murty, Shikhar and
Lacoste, Alexandre",
booktitle = "Proceedings of the 1st Workshop for Research on Agent Language Models (REALM 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.realm-1.14/",
doi = "10.18653/v1/2025.realm-1.14",
pages = "184--199",
ISBN = "979-8-89176-264-0",
abstract = "While open-source large language models (LLMs) have advanced in leveraging third-party tools, significant challenges remain in real-world API usage, where behavior is unpredictable or poorly specified. Existing benchmarks often fail to capture this complexity. We propose ToolReflection, a novel method that improves LLMs' ability to self-correct API calls by utilizing real-time API feedback. We also introduce new datasets specifically designed to test model performance under realistic conditions. In ToolReflection, models undergo instruction tuning on a dataset augmented with self-generated errors and corrections. Our evaluation across ToolAlpaca, ToolBench benchmarks, and three newly developed datasets (GPT4Tools-OOD, GPT4Tools-OOD-Hard, and Multistep-100) demonstrates its effectiveness. ToolReflection boosts overall success rates by 25.4{\%} on GPT4Tools-OOD, 56.2{\%} on GPT4Tools-OOD-Hard, and 4{\%} on Multistep-100, outperforming original models. On ToolAlpaca, we show a 14{\%} improvement in the ``Simulated'' setting and 10.5{\%} in the ``Real-world'' scenario. Our error analysis highlights ToolReflection significantly enhances recovery from incorrect tool calls, even with incomplete or erroneous API documentation. We have released the code, prompts, and data at https://github.com/polgrisha/ToolReflection."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="polyakov-etal-2025-toolreflection">
<titleInfo>
<title>ToolReflection: Improving Large Language Models for Real-World API Calls with Self-Generated Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gregory</namePart>
<namePart type="family">Polyakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilseyar</namePart>
<namePart type="family">Alimova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dmitry</namePart>
<namePart type="family">Abulkhanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Sedykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrey</namePart>
<namePart type="family">Bout</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergey</namePart>
<namePart type="family">Nikolenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irina</namePart>
<namePart type="family">Piontkovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop for Research on Agent Language Models (REALM 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Kamalloo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolas</namePart>
<namePart type="family">Gontier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xing</namePart>
<namePart type="given">Han</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nouha</namePart>
<namePart type="family">Dziri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shikhar</namePart>
<namePart type="family">Murty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandre</namePart>
<namePart type="family">Lacoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-264-0</identifier>
</relatedItem>
<abstract>While open-source large language models (LLMs) have advanced in leveraging third-party tools, significant challenges remain in real-world API usage, where behavior is unpredictable or poorly specified. Existing benchmarks often fail to capture this complexity. We propose ToolReflection, a novel method that improves LLMs’ ability to self-correct API calls by utilizing real-time API feedback. We also introduce new datasets specifically designed to test model performance under realistic conditions. In ToolReflection, models undergo instruction tuning on a dataset augmented with self-generated errors and corrections. Our evaluation across ToolAlpaca, ToolBench benchmarks, and three newly developed datasets (GPT4Tools-OOD, GPT4Tools-OOD-Hard, and Multistep-100) demonstrates its effectiveness. ToolReflection boosts overall success rates by 25.4% on GPT4Tools-OOD, 56.2% on GPT4Tools-OOD-Hard, and 4% on Multistep-100, outperforming original models. On ToolAlpaca, we show a 14% improvement in the “Simulated” setting and 10.5% in the “Real-world” scenario. Our error analysis highlights ToolReflection significantly enhances recovery from incorrect tool calls, even with incomplete or erroneous API documentation. We have released the code, prompts, and data at https://github.com/polgrisha/ToolReflection.</abstract>
<identifier type="citekey">polyakov-etal-2025-toolreflection</identifier>
<identifier type="doi">10.18653/v1/2025.realm-1.14</identifier>
<location>
<url>https://aclanthology.org/2025.realm-1.14/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>184</start>
<end>199</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ToolReflection: Improving Large Language Models for Real-World API Calls with Self-Generated Data
%A Polyakov, Gregory
%A Alimova, Ilseyar
%A Abulkhanov, Dmitry
%A Sedykh, Ivan
%A Bout, Andrey
%A Nikolenko, Sergey
%A Piontkovskaya, Irina
%Y Kamalloo, Ehsan
%Y Gontier, Nicolas
%Y Lu, Xing Han
%Y Dziri, Nouha
%Y Murty, Shikhar
%Y Lacoste, Alexandre
%S Proceedings of the 1st Workshop for Research on Agent Language Models (REALM 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-264-0
%F polyakov-etal-2025-toolreflection
%X While open-source large language models (LLMs) have advanced in leveraging third-party tools, significant challenges remain in real-world API usage, where behavior is unpredictable or poorly specified. Existing benchmarks often fail to capture this complexity. We propose ToolReflection, a novel method that improves LLMs’ ability to self-correct API calls by utilizing real-time API feedback. We also introduce new datasets specifically designed to test model performance under realistic conditions. In ToolReflection, models undergo instruction tuning on a dataset augmented with self-generated errors and corrections. Our evaluation across ToolAlpaca, ToolBench benchmarks, and three newly developed datasets (GPT4Tools-OOD, GPT4Tools-OOD-Hard, and Multistep-100) demonstrates its effectiveness. ToolReflection boosts overall success rates by 25.4% on GPT4Tools-OOD, 56.2% on GPT4Tools-OOD-Hard, and 4% on Multistep-100, outperforming original models. On ToolAlpaca, we show a 14% improvement in the “Simulated” setting and 10.5% in the “Real-world” scenario. Our error analysis highlights ToolReflection significantly enhances recovery from incorrect tool calls, even with incomplete or erroneous API documentation. We have released the code, prompts, and data at https://github.com/polgrisha/ToolReflection.
%R 10.18653/v1/2025.realm-1.14
%U https://aclanthology.org/2025.realm-1.14/
%U https://doi.org/10.18653/v1/2025.realm-1.14
%P 184-199
Markdown (Informal)
[ToolReflection: Improving Large Language Models for Real-World API Calls with Self-Generated Data](https://aclanthology.org/2025.realm-1.14/) (Polyakov et al., REALM 2025)
ACL