% ACL Anthology BibTeX record for the STEP paper (Findings of ACL 2026).
@inproceedings{chen-etal-2026-step,
    title = "{STEP}: Success-Rate-Aware Trajectory-Efficient Policy Optimization",
    author = "Chen, Yuhan and
      Liu, Yuxuan and
      Zhang, Long and
      Gao, Pengzhi and
      Luan, Jian and
      Liu, Wei",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.findings-acl.1532/",
    pages = "30681--30692",
    isbn = "979-8-89176-395-1",
    abstract = "Multi-turn interaction remains challenging for online reinforcement learning. Current GRPO-based methods{---}either at the trajectory level or the step level{---}still suffer from fundamental challenges in multi-turn settings: they allocate sampling uniformly across tasks regardless of difficulty, propagate misleading learning signals that penalize correct intermediate actions in failed trajectories, and incur high sample-collection costs under long-horizon environments. Step-level variants (e.g., GIGPO) mitigate some interaction-cost constraints by decomposing trajectories, yet they retain GRPO{'}s sampling imbalance and still struggle with heterogeneous multi-turn tasks. To address these issues, we propose STEP (Success-rate-aware Trajectory-Efficient Policy Optimization), a framework that dynamically allocates sampling based on per-task success rates and performs fine-grained step-level optimization. STEP maintains a smoothed success-rate record to guide adaptive trajectory resampling, allocating more effort to harder tasks. It then computes success-rate-weighted advantages and decomposes trajectories into step-level samples, followed by a step-level GRPO augmentation that strengthens updates on low-success tasks. Experiments on OSWorld and AndroidWorld show that STEP substantially improves sample efficiency and training stability over both trajectory-level and existing step-level GRPO variants, converging faster and generalizing better under the same sampling budget."
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey chen-etal-2026-step (same paper as the BibTeX entry with that key). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chen-etal-2026-step">
    <titleInfo>
      <title>STEP: Success-Rate-Aware Trajectory-Efficient Policy Optimization</title>
    </titleInfo>
    <!-- Authors, listed in citation order. -->
    <name type="personal">
      <namePart type="given">Yuhan</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuxuan</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Long</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pengzhi</namePart>
      <namePart type="family">Gao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jian</namePart>
      <namePart type="family">Luan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wei</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Multi-turn interaction remains challenging for online reinforcement learning. Current GRPO-based methods—either at the trajectory level or the step level—still suffer from fundamental challenges in multi-turn settings: they allocate sampling uniformly across tasks regardless of difficulty, propagate misleading learning signals that penalize correct intermediate actions in failed trajectories, and incur high sample-collection costs under long-horizon environments. Step-level variants (e.g., GIGPO) mitigate some interaction-cost constraints by decomposing trajectories, yet they retain GRPO’s sampling imbalance and still struggle with heterogeneous multi-turn tasks. To address these issues, we propose STEP (Success-rate-aware Trajectory-Efficient Policy Optimization), a framework that dynamically allocates sampling based on per-task success rates and performs fine-grained step-level optimization. STEP maintains a smoothed success-rate record to guide adaptive trajectory resampling, allocating more effort to harder tasks. It then computes success-rate-weighted advantages and decomposes trajectories into step-level samples, followed by a step-level GRPO augmentation that strengthens updates on low-success tasks. Experiments on OSWorld and AndroidWorld show that STEP substantially improves sample efficiency and training stability over both trajectory-level and existing step-level GRPO variants, converging faster and generalizing better under the same sampling budget.</abstract>
    <identifier type="citekey">chen-etal-2026-step</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1532/</url>
    </location>
    <!-- Position of the paper inside the host volume: issue date and page extent. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>30681</start>
        <end>30692</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T STEP: Success-Rate-Aware Trajectory-Efficient Policy Optimization
%A Chen, Yuhan
%A Liu, Yuxuan
%A Zhang, Long
%A Gao, Pengzhi
%A Luan, Jian
%A Liu, Wei
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F chen-etal-2026-step
%X Multi-turn interaction remains challenging for online reinforcement learning. Current GRPO-based methods—either at the trajectory level or the step level—still suffer from fundamental challenges in multi-turn settings: they allocate sampling uniformly across tasks regardless of difficulty, propagate misleading learning signals that penalize correct intermediate actions in failed trajectories, and incur high sample-collection costs under long-horizon environments. Step-level variants (e.g., GIGPO) mitigate some interaction-cost constraints by decomposing trajectories, yet they retain GRPO’s sampling imbalance and still struggle with heterogeneous multi-turn tasks. To address these issues, we propose STEP (Success-rate-aware Trajectory-Efficient Policy Optimization), a framework that dynamically allocates sampling based on per-task success rates and performs fine-grained step-level optimization. STEP maintains a smoothed success-rate record to guide adaptive trajectory resampling, allocating more effort to harder tasks. It then computes success-rate-weighted advantages and decomposes trajectories into step-level samples, followed by a step-level GRPO augmentation that strengthens updates on low-success tasks. Experiments on OSWorld and AndroidWorld show that STEP substantially improves sample efficiency and training stability over both trajectory-level and existing step-level GRPO variants, converging faster and generalizing better under the same sampling budget.
%U https://aclanthology.org/2026.findings-acl.1532/
%P 30681-30692
Markdown (Informal)
[STEP: Success-Rate-Aware Trajectory-Efficient Policy Optimization](https://aclanthology.org/2026.findings-acl.1532/) (Chen et al., Findings 2026)
ACL