@inproceedings{jang-etal-2026-irpo,
title = "{IRPO}: Implicit Policy Regularized Preference Optimization",
author = "Jang, Youngsoo and
Kim, Yu Jin and
Kim, Geon-Hyeong and
Lee, Honglak and
Lee, Moontae",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.281/",
pages = "5304--5325",
ISBN = "979-8-89176-386-9",
abstract = "Training complexity often scales with the size of hyperparameter space for Large Language Models (LLMs). While Direct Preference Optimization (DPO) offers learning stability through reparameterizing the reward function, its regularization against the reference policy can lead to suboptimal outcomes when the reference policy is not optimal. Recent DPO variants address this concern, but at a cost: they introduce additional hyperparameters, reducing feasibility for LLM fine-tuning. To overcome this challenge, we introduce Implicit policy Regularized Preference Optimization (IRPO), which tackles suboptimality while maintaining training simplicity. By treating the winning policy that generated the chosen responses in a pairwise dataset as an implicit policy, IRPO maximizes KL-regularized reward without extra hyperparameters. Then we propose a novel PO algorithm that directly optimizes the IRPO objective by estimating the likelihood ratio between implicit policies. As the winning policy generally outperforms the reference policy, IRPO can effectively address suboptimality. Our experiments show that IRPO significantly outperforms baseline algorithms with the same hyperparameter complexity. Moreover, IRPO demonstrates comparable performance to recent algorithms that rely on a larger number of hyperparameters, offering a practical solution for scalable LLM fine-tuning."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jang-etal-2026-irpo">
<titleInfo>
<title>IRPO: Implicit Policy Regularized Preference Optimization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Youngsoo</namePart>
<namePart type="family">Jang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="given">Jin</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Geon-Hyeong</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Honglak</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moontae</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Training complexity often scales with the size of hyperparameter space for Large Language Models (LLMs). While Direct Preference Optimization (DPO) offers learning stability through reparameterizing the reward function, its regularization against the reference policy can lead to suboptimal outcomes when the reference policy is not optimal. Recent DPO variants address this concern, but at a cost: they introduce additional hyperparameters, reducing feasibility for LLM fine-tuning. To overcome this challenge, we introduce Implicit policy Regularized Preference Optimization (IRPO), which tackles suboptimality while maintaining training simplicity. By treating the winning policy that generated the chosen responses in a pairwise dataset as an implicit policy, IRPO maximizes KL-regularized reward without extra hyperparameters. Then we propose a novel PO algorithm that directly optimizes the IRPO objective by estimating the likelihood ratio between implicit policies. As the winning policy generally outperforms the reference policy, IRPO can effectively address suboptimality. Our experiments show that IRPO significantly outperforms baseline algorithms with the same hyperparameter complexity. Moreover, IRPO demonstrates comparable performance to recent algorithms that rely on a larger number of hyperparameters, offering a practical solution for scalable LLM fine-tuning.</abstract>
<identifier type="citekey">jang-etal-2026-irpo</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.281/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>5304</start>
<end>5325</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T IRPO: Implicit Policy Regularized Preference Optimization
%A Jang, Youngsoo
%A Kim, Yu Jin
%A Kim, Geon-Hyeong
%A Lee, Honglak
%A Lee, Moontae
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F jang-etal-2026-irpo
%X Training complexity often scales with the size of the hyperparameter space for Large Language Models (LLMs). While Direct Preference Optimization (DPO) offers learning stability through reparameterizing the reward function, its regularization against the reference policy can lead to suboptimal outcomes when the reference policy is not optimal. Recent DPO variants address this concern, but at a cost: they introduce additional hyperparameters, reducing feasibility for LLM fine-tuning. To overcome this challenge, we introduce Implicit policy Regularized Preference Optimization (IRPO), which tackles suboptimality while maintaining training simplicity. By treating the winning policy that generated the chosen responses in a pairwise dataset as an implicit policy, IRPO maximizes KL-regularized reward without extra hyperparameters. We then propose a novel PO algorithm that directly optimizes the IRPO objective by estimating the likelihood ratio between implicit policies. As the winning policy generally outperforms the reference policy, IRPO can effectively address suboptimality. Our experiments show that IRPO significantly outperforms baseline algorithms with the same hyperparameter complexity. Moreover, IRPO demonstrates comparable performance to recent algorithms that rely on a larger number of hyperparameters, offering a practical solution for scalable LLM fine-tuning.
%U https://aclanthology.org/2026.findings-eacl.281/
%P 5304-5325
Markdown (Informal):
[IRPO: Implicit Policy Regularized Preference Optimization](https://aclanthology.org/2026.findings-eacl.281/) (Jang et al., Findings 2026)

ACL:
Youngsoo Jang, Yu Jin Kim, Geon-Hyeong Kim, Honglak Lee, and Moontae Lee. 2026. IRPO: Implicit Policy Regularized Preference Optimization. In Findings of the Association for Computational Linguistics: EACL 2026, pages 5304–5325, Rabat, Morocco. Association for Computational Linguistics.
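For readers skimming this record, the objective the abstract describes can be sketched in standard notation. This is a hedged reconstruction from the abstract alone, not the paper's exact formulation: the first objective is DPO's well-known KL-regularized reward maximization (Rafailov et al., 2023); the second swaps the reference policy $\pi_{\mathrm{ref}}$ for the implicit winning policy $\pi_w$ that generated the chosen responses in the pairwise dataset, as the abstract describes. The reward $r$, temperature $\beta$, and dataset $\mathcal{D}$ follow DPO's conventions and are assumptions here.

```latex
% DPO's KL-regularized objective (standard form, Rafailov et al., 2023):
\max_{\pi}\; \mathbb{E}_{x \sim \mathcal{D},\, y \sim \pi(\cdot \mid x)}\big[ r(x, y) \big]
  - \beta\, D_{\mathrm{KL}}\!\big( \pi(\cdot \mid x) \,\big\|\, \pi_{\mathrm{ref}}(\cdot \mid x) \big)

% IRPO as inferred from the abstract (a sketch, not the paper's exact objective):
% regularize toward the implicit winning policy \pi_w instead of \pi_{ref},
% reusing DPO's single temperature \beta, hence no extra hyperparameters.
\max_{\pi}\; \mathbb{E}_{x \sim \mathcal{D},\, y \sim \pi(\cdot \mid x)}\big[ r(x, y) \big]
  - \beta\, D_{\mathrm{KL}}\!\big( \pi(\cdot \mid x) \,\big\|\, \pi_{w}(\cdot \mid x) \big)
```

Per the abstract, $\pi_w$ is never trained or sampled explicitly; the proposed algorithm instead estimates the likelihood ratio between implicit policies, which is what keeps the hyperparameter count at DPO's level.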