% BibTeX record for ACL Anthology paper 2026.findings-acl.1370 (text outside an @entry is ignored by BibTeX).
@inproceedings{hu-etal-2026-reasoning,
  title     = {Reasoning-Guided Exploration for Online {DPO}},
  author    = {Hu, Zetian and
               Liu, Shunyu and
               Lin, Ting-En and
               Huang, Fei and
               Li, Yongbin and
               Tao, Dacheng},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1370/},
  pages     = {27526--27542},
  isbn      = {979-8-89176-395-1},
  abstract  = {Recent work has aimed to enhance the reasoning capabilities of language models, but these methods are often limited to domains with objectively verifiable answers. To overcome this limitation, we introduce Reasoning-Guided Exploration for Online DPO (RGE-DPO), a novel self-play framework designed to improve reasoning on general-domain data. RGE-DPO employs a dual-reward mechanism to evaluate responses by assessing: (1) reasoning quality using a self-rewarding rubric that provides structured evaluation of logical coherence, reasoning depth, and verification behaviors; and (2) response quality using an established reward model trained for aspects like helpfulness and correctness. These two orthogonal evaluation signals enable a comprehensive assessment of different response dimensions without conflating reasoning processes with response content. We then integrate these two evaluation signals based on a weighted ranking mechanism to construct the preference pairs, which ensures that responses with superior reasoning processes are preferred when response quality is comparable. Experiments demonstrate that RGE-DPO achieves substantial improvements in instruction-following benchmark while maintaining competitive performance on verifiable academic benchmarks.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; the ID matches the citekey identifier below. -->
  <mods ID="hu-etal-2026-reasoning">
    <titleInfo>
      <title>Reasoning-Guided Exploration for Online DPO</title>
    </titleInfo>

    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Zetian</namePart>
      <namePart type="family">Hu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shunyu</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ting-En</namePart>
      <namePart type="family">Lin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fei</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yongbin</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dacheng</namePart>
      <namePart type="family">Tao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <abstract>Recent work has aimed to enhance the reasoning capabilities of language models, but these methods are often limited to domains with objectively verifiable answers. To overcome this limitation, we introduce Reasoning-Guided Exploration for Online DPO (RGE-DPO), a novel self-play framework designed to improve reasoning on general-domain data. RGE-DPO employs a dual-reward mechanism to evaluate responses by assessing: (1) reasoning quality using a self-rewarding rubric that provides structured evaluation of logical coherence, reasoning depth, and verification behaviors; and (2) response quality using an established reward model trained for aspects like helpfulness and correctness. These two orthogonal evaluation signals enable a comprehensive assessment of different response dimensions without conflating reasoning processes with response content. We then integrate these two evaluation signals based on a weighted ranking mechanism to construct the preference pairs, which ensures that responses with superior reasoning processes are preferred when response quality is comparable. Experiments demonstrate that RGE-DPO achieves substantial improvements in instruction-following benchmark while maintaining competitive performance on verifiable academic benchmarks.</abstract>

    <identifier type="citekey">hu-etal-2026-reasoning</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1370/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>27526</start>
        <end>27542</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Reasoning-Guided Exploration for Online DPO
%A Hu, Zetian
%A Liu, Shunyu
%A Lin, Ting-En
%A Huang, Fei
%A Li, Yongbin
%A Tao, Dacheng
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F hu-etal-2026-reasoning
%X Recent work has aimed to enhance the reasoning capabilities of language models, but these methods are often limited to domains with objectively verifiable answers. To overcome this limitation, we introduce Reasoning-Guided Exploration for Online DPO (RGE-DPO), a novel self-play framework designed to improve reasoning on general-domain data. RGE-DPO employs a dual-reward mechanism to evaluate responses by assessing: (1) reasoning quality using a self-rewarding rubric that provides structured evaluation of logical coherence, reasoning depth, and verification behaviors; and (2) response quality using an established reward model trained for aspects like helpfulness and correctness. These two orthogonal evaluation signals enable a comprehensive assessment of different response dimensions without conflating reasoning processes with response content. We then integrate these two evaluation signals based on a weighted ranking mechanism to construct the preference pairs, which ensures that responses with superior reasoning processes are preferred when response quality is comparable. Experiments demonstrate that RGE-DPO achieves substantial improvements in instruction-following benchmark while maintaining competitive performance on verifiable academic benchmarks.
%U https://aclanthology.org/2026.findings-acl.1370/
%P 27526-27542
Markdown (Informal)
[Reasoning-Guided Exploration for Online DPO](https://aclanthology.org/2026.findings-acl.1370/) (Hu et al., Findings 2026)
ACL
- Zetian Hu, Shunyu Liu, Ting-En Lin, Fei Huang, Yongbin Li, and Dacheng Tao. 2026. Reasoning-Guided Exploration for Online DPO. In Findings of the Association for Computational Linguistics: ACL 2026, pages 27526–27542, San Diego, California, United States. Association for Computational Linguistics.