@inproceedings{zuo-etal-2025-planetarium,
title = "Planetarium: A Rigorous Benchmark for Translating Text to Structured Planning Languages",
author = "Zuo, Max and
Velez, Francisco Piedrahita and
Li, Xiaochen and
Littman, Michael and
Bach, Stephen",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.560/",
doi = "10.18653/v1/2025.naacl-long.560",
pages = "11223--11240",
ISBN = "979-8-89176-189-6",
abstract = "Recent works have explored using language models for planning problems. One approach examines translating natural language descriptions of planning tasks into structured planning languages, such as the planning domain definition language (PDDL). Existing evaluation methods struggle to ensure semantic correctness and rely on simple or unrealistic datasets. To bridge this gap, we introduce Planetarium, a benchmark designed to evaluate language models' ability to generate PDDL code from natural language descriptions of planning tasks. Planetarium features a novel PDDL equivalence algorithm that flexibly evaluates the correctness of generated PDDL against ground truth, along with a dataset of 145,918 text-to-PDDL pairs across 73 unique state combinations with varying levels of difficulty. Finally, we evaluate several API-access and open-weight language models that reveal this task{'}s complexity. For example, 96.1{\%} of the PDDL problem descriptions generated by GPT-4o are syntactically parseable, 94.4{\%} are solvable, but only 24.8{\%} are semantically correct, highlighting the need for a more rigorous benchmark for this problem."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zuo-etal-2025-planetarium">
<titleInfo>
<title>Planetarium: A Rigorous Benchmark for Translating Text to Structured Planning Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Zuo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francisco</namePart>
<namePart type="given">Piedrahita</namePart>
<namePart type="family">Velez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaochen</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Littman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stephen</namePart>
<namePart type="family">Bach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Recent works have explored using language models for planning problems. One approach examines translating natural language descriptions of planning tasks into structured planning languages, such as the planning domain definition language (PDDL). Existing evaluation methods struggle to ensure semantic correctness and rely on simple or unrealistic datasets. To bridge this gap, we introduce Planetarium, a benchmark designed to evaluate language models’ ability to generate PDDL code from natural language descriptions of planning tasks. Planetarium features a novel PDDL equivalence algorithm that flexibly evaluates the correctness of generated PDDL against ground truth, along with a dataset of 145,918 text-to-PDDL pairs across 73 unique state combinations with varying levels of difficulty. Finally, we evaluate several API-access and open-weight language models that reveal this task’s complexity. For example, 96.1% of the PDDL problem descriptions generated by GPT-4o are syntactically parseable, 94.4% are solvable, but only 24.8% are semantically correct, highlighting the need for a more rigorous benchmark for this problem.</abstract>
<identifier type="citekey">zuo-etal-2025-planetarium</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.560</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.560/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>11223</start>
<end>11240</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Planetarium: A Rigorous Benchmark for Translating Text to Structured Planning Languages
%A Zuo, Max
%A Velez, Francisco Piedrahita
%A Li, Xiaochen
%A Littman, Michael
%A Bach, Stephen
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F zuo-etal-2025-planetarium
%X Recent works have explored using language models for planning problems. One approach examines translating natural language descriptions of planning tasks into structured planning languages, such as the planning domain definition language (PDDL). Existing evaluation methods struggle to ensure semantic correctness and rely on simple or unrealistic datasets. To bridge this gap, we introduce Planetarium, a benchmark designed to evaluate language models’ ability to generate PDDL code from natural language descriptions of planning tasks. Planetarium features a novel PDDL equivalence algorithm that flexibly evaluates the correctness of generated PDDL against ground truth, along with a dataset of 145,918 text-to-PDDL pairs across 73 unique state combinations with varying levels of difficulty. Finally, we evaluate several API-access and open-weight language models that reveal this task’s complexity. For example, 96.1% of the PDDL problem descriptions generated by GPT-4o are syntactically parseable, 94.4% are solvable, but only 24.8% are semantically correct, highlighting the need for a more rigorous benchmark for this problem.
%R 10.18653/v1/2025.naacl-long.560
%U https://aclanthology.org/2025.naacl-long.560/
%U https://doi.org/10.18653/v1/2025.naacl-long.560
%P 11223-11240
Markdown (Informal)
[Planetarium: A Rigorous Benchmark for Translating Text to Structured Planning Languages](https://aclanthology.org/2025.naacl-long.560/) (Zuo et al., NAACL 2025)
ACL
- Max Zuo, Francisco Piedrahita Velez, Xiaochen Li, Michael Littman, and Stephen Bach. 2025. Planetarium: A Rigorous Benchmark for Translating Text to Structured Planning Languages. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 11223–11240, Albuquerque, New Mexico. Association for Computational Linguistics.