@inproceedings{zhou-etal-2026-lepo,
    title = "{LEPO}: Latent Reasoning Policy Optimization for Large Language Models",
    author = "Zhou, Yuyan and
      Yu, Jiarui and
      Dong, Hande and
      Hao, Zhezheng and
      Wang, Hong and
      Zhang, Jianqing and
      Lin, Qiang",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.findings-acl.707/",
    pages = "14416--14427",
    isbn = "979-8-89176-395-1",
    abstract = "Recently, latent reasoning has been introduced into large language models (LLMs) to leverage rich information within a continuous space. However, without stochastic sampling, these methods inevitably collapse to deterministic inference, failing to discover diverse reasoning paths. To bridge the gap, we inject controllable stochasticity into latent reasoning via Gumbel-Softmax, restoring LLMs' exploratory capacity and enhancing their compatibility with Reinforcement Learning (RL). Building on this, we propose **L**atent R**e**asoning **P**olicy **O**ptimization (**LEPO**), a novel framework that applies RL directly to continuous latent representations. Specifically, in rollout stage, LEPO maintains stochasticity to enable diverse trajectory sampling, while in optimization stage, LEPO constructs a unified gradient estimation for both latent representations and discrete tokens."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-etal-2026-lepo">
<titleInfo>
<title>LEPO: Latent Reasoning Policy Optimization for Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuyan</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiarui</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hande</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhezheng</namePart>
<namePart type="family">Hao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianqing</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiang</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Recently, latent reasoning has been introduced into large language models (LLMs) to leverage rich information within a continuous space. However, without stochastic sampling, these methods inevitably collapse to deterministic inference, failing to discover diverse reasoning paths. To bridge the gap, we inject controllable stochasticity into latent reasoning via Gumbel-Softmax, restoring LLMs’ exploratory capacity and enhancing their compatibility with Reinforcement Learning (RL). Building on this, we propose **L**atent R**e**asoning **P**olicy **O**ptimization (**LEPO**), a novel framework that applies RL directly to continuous latent representations. Specifically, in rollout stage, LEPO maintains stochasticity to enable diverse trajectory sampling, while in optimization stage, LEPO constructs a unified gradient estimation for both latent representations and discrete tokens.</abstract>
<identifier type="citekey">zhou-etal-2026-lepo</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.707/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>14416</start>
<end>14427</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LEPO: Latent Reasoning Policy Optimization for Large Language Models
%A Zhou, Yuyan
%A Yu, Jiarui
%A Dong, Hande
%A Hao, Zhezheng
%A Wang, Hong
%A Zhang, Jianqing
%A Lin, Qiang
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhou-etal-2026-lepo
%X Recently, latent reasoning has been introduced into large language models (LLMs) to leverage rich information within a continuous space. However, without stochastic sampling, these methods inevitably collapse to deterministic inference, failing to discover diverse reasoning paths. To bridge the gap, we inject controllable stochasticity into latent reasoning via Gumbel-Softmax, restoring LLMs’ exploratory capacity and enhancing their compatibility with Reinforcement Learning (RL). Building on this, we propose **L**atent R**e**asoning **P**olicy **O**ptimization (**LEPO**), a novel framework that applies RL directly to continuous latent representations. Specifically, in rollout stage, LEPO maintains stochasticity to enable diverse trajectory sampling, while in optimization stage, LEPO constructs a unified gradient estimation for both latent representations and discrete tokens.
%U https://aclanthology.org/2026.findings-acl.707/
%P 14416-14427
Markdown (Informal)
[LEPO: Latent Reasoning Policy Optimization for Large Language Models](https://aclanthology.org/2026.findings-acl.707/) (Zhou et al., Findings 2026)
ACL
- Yuyan Zhou, Jiarui Yu, Hande Dong, Zhezheng Hao, Hong Wang, Jianqing Zhang, and Qiang Lin. 2026. LEPO: Latent Reasoning Policy Optimization for Large Language Models. In Findings of the Association for Computational Linguistics: ACL 2026, pages 14416–14427, San Diego, California, United States. Association for Computational Linguistics.