BibTeX
@inproceedings{zheng-etal-2025-planningarena,
title = "{P}lanning{A}rena: A Modular Benchmark for Multidimensional Evaluation of Planning and Tool Learning",
author = "Zheng, Zihan and
Cui, Tianle and
Xie, Chuwen and
Pan, Jiahui and
Chen, Qianglong and
He, Lewei",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1499/",
doi = "10.18653/v1/2025.acl-long.1499",
pages = "31047--31086",
ISBN = "979-8-89176-251-0",
abstract = "One of the research focuses of large language models (LLMs) is the ability to generate action plans. Recent studies have revealed that the performance of LLMs can be significantly improved by integrating external tools. Based on this, we propose a benchmark framework called PlanningArena, which aims to simulate real application scenarios and provide a series of apps and API tools that may be involved in the actual planning process. This framework adopts a modular task structure and combines user portrait analysis to evaluate the ability of LLMs in correctly selecting tools, logical reasoning in complex scenarios, and parsing user information. In addition, we deeply diagnose the task execution effect of LLMs from both macro and micro levels. The experimental results show that even the most outstanding GPT-4o and DeepSeekV3 models only achieved a total score of 56.5{\%} and 41.9{\%} in PlanningArena, respectively, indicating that current LLMs still face challenges in logical reasoning, context memory, and tool calling when dealing with different structures, scenarios, and their complexity. Through this benchmark, we further explore the path to optimize LLMs to perform planning tasks."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zheng-etal-2025-planningarena">
    <titleInfo>
      <title>PlanningArena: A Modular Benchmark for Multidimensional Evaluation of Planning and Tool Learning</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Zihan</namePart>
      <namePart type="family">Zheng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tianle</namePart>
      <namePart type="family">Cui</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chuwen</namePart>
      <namePart type="family">Xie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiahui</namePart>
      <namePart type="family">Pan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qianglong</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lewei</namePart>
      <namePart type="family">He</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-251-0</identifier>
    </relatedItem>
    <abstract>One of the research focuses of large language models (LLMs) is the ability to generate action plans. Recent studies have revealed that the performance of LLMs can be significantly improved by integrating external tools. Based on this, we propose a benchmark framework called PlanningArena, which aims to simulate real application scenarios and provide a series of apps and API tools that may be involved in the actual planning process. This framework adopts a modular task structure and combines user portrait analysis to evaluate the ability of LLMs in correctly selecting tools, logical reasoning in complex scenarios, and parsing user information. In addition, we deeply diagnose the task execution effect of LLMs from both macro and micro levels. The experimental results show that even the most outstanding GPT-4o and DeepSeekV3 models only achieved a total score of 56.5% and 41.9% in PlanningArena, respectively, indicating that current LLMs still face challenges in logical reasoning, context memory, and tool calling when dealing with different structures, scenarios, and their complexity. Through this benchmark, we further explore the path to optimize LLMs to perform planning tasks.</abstract>
    <identifier type="citekey">zheng-etal-2025-planningarena</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-long.1499</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-long.1499/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>31047</start>
        <end>31086</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T PlanningArena: A Modular Benchmark for Multidimensional Evaluation of Planning and Tool Learning
%A Zheng, Zihan
%A Cui, Tianle
%A Xie, Chuwen
%A Pan, Jiahui
%A Chen, Qianglong
%A He, Lewei
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F zheng-etal-2025-planningarena
%X One of the research focuses of large language models (LLMs) is the ability to generate action plans. Recent studies have revealed that the performance of LLMs can be significantly improved by integrating external tools. Based on this, we propose a benchmark framework called PlanningArena, which aims to simulate real application scenarios and provide a series of apps and API tools that may be involved in the actual planning process. This framework adopts a modular task structure and combines user portrait analysis to evaluate the ability of LLMs in correctly selecting tools, logical reasoning in complex scenarios, and parsing user information. In addition, we deeply diagnose the task execution effect of LLMs from both macro and micro levels. The experimental results show that even the most outstanding GPT-4o and DeepSeekV3 models only achieved a total score of 56.5% and 41.9% in PlanningArena, respectively, indicating that current LLMs still face challenges in logical reasoning, context memory, and tool calling when dealing with different structures, scenarios, and their complexity. Through this benchmark, we further explore the path to optimize LLMs to perform planning tasks.
%R 10.18653/v1/2025.acl-long.1499
%U https://aclanthology.org/2025.acl-long.1499/
%U https://doi.org/10.18653/v1/2025.acl-long.1499
%P 31047-31086

Markdown (Informal)
[PlanningArena: A Modular Benchmark for Multidimensional Evaluation of Planning and Tool Learning](https://aclanthology.org/2025.acl-long.1499/) (Zheng et al., ACL 2025)

ACL
Zihan Zheng, Tianle Cui, Chuwen Xie, Jiahui Pan, Qianglong Chen, and Lewei He. 2025. [PlanningArena: A Modular Benchmark for Multidimensional Evaluation of Planning and Tool Learning](https://aclanthology.org/2025.acl-long.1499/). In *Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*, pages 31047–31086, Vienna, Austria. Association for Computational Linguistics.