@inproceedings{zhang-etal-2025-copr,
title = "{COPR}: Continual Human Preference Learning via Optimal Policy Regularization",
author = "Zhang, Han and
Gui, Lin and
Lei, Yu and
Zhai, Yuanzhao and
Zhang, Yehong and
Zhang, Zhuo and
He, Yulan and
Wang, Hui and
Yu, Yue and
Wong, Kam-Fai and
Liang, Bin and
Xu, Ruifeng",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.281/",
doi = "10.18653/v1/2025.findings-acl.281",
pages = "5377--5398",
ISBN = "979-8-89176-256-5",
abstract = "Reinforcement Learning from Human Feedback (RLHF) is effective for aligning Large Language Models (LLMs) with human preferences. However, RLHF{'}s complex process limits its ability to continually learn human feedback, making it impractical for real-world applications where the deployed model continuously receives feedback from users. The non-RL-based method, such as Direct Preference Optimization (DPO), is not primitively favorable for Continual Learning (CL). We observe that when combined with Experiment Relay (ER) for CL, DPO tends to significantly widen the gap in the probability of human-preferred and dispreferred responses. Consequently, this diminishes the diversity in model generation, potentially leading to model collapse. To overcome the above challenges, we propose the Continual Optimal Policy Regularization (COPR), a novel non-RL offline method to convert the historical optimal policies into optimization constraints when continually learning new preferences. We first derive a moderate reward function from the pairwise ranking loss and then use the moderate reward to calculate a new sampling distribution to construct novel learning objectives and constraints. We also provide formal proof of the learnability of COPR. The experimental results show that COPR outperforms strong CL baselines on our proposed benchmark, in terms of reward-based, GPT-4 evaluations and human assessment."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-copr">
<titleInfo>
<title>COPR: Continual Human Preference Learning via Optimal Policy Regularization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Han</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lin</namePart>
<namePart type="family">Gui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Lei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuanzhao</namePart>
<namePart type="family">Zhai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yehong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuo</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kam-Fai</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruifeng</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Reinforcement Learning from Human Feedback (RLHF) is effective for aligning Large Language Models (LLMs) with human preferences. However, RLHF’s complex process limits its ability to continually learn human feedback, making it impractical for real-world applications where the deployed model continuously receives feedback from users. The non-RL-based method, such as Direct Preference Optimization (DPO), is not primitively favorable for Continual Learning (CL). We observe that when combined with Experiment Relay (ER) for CL, DPO tends to significantly widen the gap in the probability of human-preferred and dispreferred responses. Consequently, this diminishes the diversity in model generation, potentially leading to model collapse. To overcome the above challenges, we propose the Continual Optimal Policy Regularization (COPR), a novel non-RL offline method to convert the historical optimal policies into optimization constraints when continually learning new preferences. We first derive a moderate reward function from the pairwise ranking loss and then use the moderate reward to calculate a new sampling distribution to construct novel learning objectives and constraints. We also provide formal proof of the learnability of COPR. The experimental results show that COPR outperforms strong CL baselines on our proposed benchmark, in terms of reward-based, GPT-4 evaluations and human assessment.</abstract>
<identifier type="citekey">zhang-etal-2025-copr</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.281</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.281/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>5377</start>
<end>5398</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T COPR: Continual Human Preference Learning via Optimal Policy Regularization
%A Zhang, Han
%A Gui, Lin
%A Lei, Yu
%A Zhai, Yuanzhao
%A Zhang, Yehong
%A Zhang, Zhuo
%A He, Yulan
%A Wang, Hui
%A Yu, Yue
%A Wong, Kam-Fai
%A Liang, Bin
%A Xu, Ruifeng
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F zhang-etal-2025-copr
%X Reinforcement Learning from Human Feedback (RLHF) is effective for aligning Large Language Models (LLMs) with human preferences. However, RLHF’s complex process limits its ability to continually learn human feedback, making it impractical for real-world applications where the deployed model continuously receives feedback from users. The non-RL-based method, such as Direct Preference Optimization (DPO), is not primitively favorable for Continual Learning (CL). We observe that when combined with Experiment Relay (ER) for CL, DPO tends to significantly widen the gap in the probability of human-preferred and dispreferred responses. Consequently, this diminishes the diversity in model generation, potentially leading to model collapse. To overcome the above challenges, we propose the Continual Optimal Policy Regularization (COPR), a novel non-RL offline method to convert the historical optimal policies into optimization constraints when continually learning new preferences. We first derive a moderate reward function from the pairwise ranking loss and then use the moderate reward to calculate a new sampling distribution to construct novel learning objectives and constraints. We also provide formal proof of the learnability of COPR. The experimental results show that COPR outperforms strong CL baselines on our proposed benchmark, in terms of reward-based, GPT-4 evaluations and human assessment.
%R 10.18653/v1/2025.findings-acl.281
%U https://aclanthology.org/2025.findings-acl.281/
%U https://doi.org/10.18653/v1/2025.findings-acl.281
%P 5377-5398
Markdown (Informal)
[COPR: Continual Human Preference Learning via Optimal Policy Regularization](https://aclanthology.org/2025.findings-acl.281/) (Zhang et al., Findings 2025)
ACL
- Han Zhang, Lin Gui, Yu Lei, Yuanzhao Zhai, Yehong Zhang, Zhuo Zhang, Yulan He, Hui Wang, Yue Yu, Kam-Fai Wong, Bin Liang, and Ruifeng Xu. 2025. COPR: Continual Human Preference Learning via Optimal Policy Regularization. In Findings of the Association for Computational Linguistics: ACL 2025, pages 5377–5398, Vienna, Austria. Association for Computational Linguistics.