% BibTeX record for the PVPO paper (Findings of ACL 2026).
% The same metadata is repeated after this entry as MODS XML and EndNote records.
@inproceedings{feng-etal-2026-pvpo,
  title     = {{PVPO}: Pre-Estimated Value-Based Policy Optimization for Agentic Reasoning},
  author    = {Feng, Wenfeng and
               Zhao, Penghong and
               Jiang, Guochao and
               Hao, Chuzhan and
               Liu, Guohua and
               Zhang, Yuewei},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.182/},
  pages     = {3729--3748},
  isbn      = {979-8-89176-395-1},
  abstract  = {Grouping-based methods have emerged as a significant frontier in Reinforcement Learning (RL), yet agentic reasoning poses a fundamental challenge for grouping-based methods: frequent environmental interactions and multi-step tool invocation generate highly variable trajectories, rendering intra-group advantage estimation unstable. In response, practitioners resort to excessive rollouts to stabilize training, which in turn incurs prohibitive computational costs. This negative feedback loop between advantage estimation instability and sampling inefficiency severely limits learning performance. We present PVPO, a stable and efficient critic-free RL framework that breaks this cycle through a pre-estimated value baseline and pre-sampled data filtering. Specifically, before training begins, PVPO performs a single round of rollouts to compute two signals: (1) Static V, a Monte Carlo estimate of the expected return that serves as a fixed baseline to stabilize advantage estimation; and (2) sample-level accuracy, as a difficulty metric to filter out trivial samples and inject ground-truth trajectories into hard ones, thereby enhancing training efficiency. As shown in Figure 1, experiments demonstrate that PVPO outperforms other grouping-based methods in both multi-step retrieval tasks and advanced mathematical reasoning benchmarks. Notably, our 7B model trained with PVPO matches or exceeds the performance of large language models (LLMs). Moreover, PVPO achieves a 2.5x speedup in training time compared to prior methods while maintaining comparable final performance.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey feng-etal-2026-pvpo: paper-level metadata first, host proceedings inside relatedItem. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="feng-etal-2026-pvpo">
    <titleInfo>
      <title>PVPO: Pre-Estimated Value-Based Policy Optimization for Agentic Reasoning</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Wenfeng</namePart>
      <namePart type="family">Feng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Penghong</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guochao</namePart>
      <namePart type="family">Jiang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chuzhan</namePart>
      <namePart type="family">Hao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guohua</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuewei</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Grouping-based methods have emerged as a significant frontier in Reinforcement Learning (RL), yet agentic reasoning poses a fundamental challenge for grouping-based methods: frequent environmental interactions and multi-step tool invocation generate highly variable trajectories, rendering intra-group advantage estimation unstable. In response, practitioners resort to excessive rollouts to stabilize training, which in turn incurs prohibitive computational costs. This negative feedback loop between advantage estimation instability and sampling inefficiency severely limits learning performance. We present PVPO, a stable and efficient critic-free RL framework that breaks this cycle through a pre-estimated value baseline and pre-sampled data filtering. Specifically, before training begins, PVPO performs a single round of rollouts to compute two signals: (1) Static V, a Monte Carlo estimate of the expected return that serves as a fixed baseline to stabilize advantage estimation; and (2) sample-level accuracy, as a difficulty metric to filter out trivial samples and inject ground-truth trajectories into hard ones, thereby enhancing training efficiency. As shown in Figure 1, experiments demonstrate that PVPO outperforms other grouping-based methods in both multi-step retrieval tasks and advanced mathematical reasoning benchmarks. Notably, our 7B model trained with PVPO matches or exceeds the performance of large language models (LLMs). Moreover, PVPO achieves a 2.5x speedup in training time compared to prior methods while maintaining comparable final performance.</abstract>
    <identifier type="citekey">feng-etal-2026-pvpo</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.182/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>3729</start>
        <end>3748</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T PVPO: Pre-Estimated Value-Based Policy Optimization for Agentic Reasoning
%A Feng, Wenfeng
%A Zhao, Penghong
%A Jiang, Guochao
%A Hao, Chuzhan
%A Liu, Guohua
%A Zhang, Yuewei
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F feng-etal-2026-pvpo
%X Grouping-based methods have emerged as a significant frontier in Reinforcement Learning (RL), yet agentic reasoning poses a fundamental challenge for grouping-based methods: frequent environmental interactions and multi-step tool invocation generate highly variable trajectories, rendering intra-group advantage estimation unstable. In response, practitioners resort to excessive rollouts to stabilize training, which in turn incurs prohibitive computational costs. This negative feedback loop between advantage estimation instability and sampling inefficiency severely limits learning performance. We present PVPO, a stable and efficient critic-free RL framework that breaks this cycle through a pre-estimated value baseline and pre-sampled data filtering. Specifically, before training begins, PVPO performs a single round of rollouts to compute two signals: (1) Static V, a Monte Carlo estimate of the expected return that serves as a fixed baseline to stabilize advantage estimation; and (2) sample-level accuracy, as a difficulty metric to filter out trivial samples and inject ground-truth trajectories into hard ones, thereby enhancing training efficiency. As shown in Figure 1, experiments demonstrate that PVPO outperforms other grouping-based methods in both multi-step retrieval tasks and advanced mathematical reasoning benchmarks. Notably, our 7B model trained with PVPO matches or exceeds the performance of large language models (LLMs). Moreover, PVPO achieves a 2.5x speedup in training time compared to prior methods while maintaining comparable final performance.
%U https://aclanthology.org/2026.findings-acl.182/
%P 3729-3748
Markdown (Informal)
[PVPO: Pre-Estimated Value-Based Policy Optimization for Agentic Reasoning](https://aclanthology.org/2026.findings-acl.182/) (Feng et al., Findings 2026)
ACL