@inproceedings{hang-etal-2024-prior,
title = "Prior Constraints-based Reward Model Training for Aligning Large Language Models",
author = "Hang, Zhou and
Chenglong, Wang and
Yimin, Hu and
Tong, Xiao and
Chunliang, Zhang and
Jingbo, Zhu",
editor = "Sun, Maosong and
Liang, Jiye and
Han, Xianpei and
Liu, Zhiyuan and
He, Yulan",
booktitle = "Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)",
month = jul,
year = "2024",
address = "Taiyuan, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2024.ccl-1.107/",
pages = "1395--1407",
language = "eng",
abstract = "{\textquotedblleft}Reinforcement learning with human feedback for aligning large language models (LLMs) trainsa reward model typically using ranking loss with comparison pairs. However, the training pro-cedure suffers from an inherent problem: the uncontrolled scaling of reward scores during rein-forcement learning due to the lack of constraints while training the reward model. This paperproposes a Prior Constraints-based Reward Model (PCRM) training method to mitigate thisproblem. PCRM incorporates prior constraints{---}specifically, length ratio and cosine similaritybetween outputs of each comparison pair{---}during reward model training to regulate optimiza-tion magnitude and control score margins. We comprehensively evaluate PCRM by examining itsrank correlation with human preferences and its effectiveness in aligning LLMs via RL. Exper-imental results demonstrate that PCRM significantly improves alignment performance by effec-tively constraining reward score scaling. As another bonus, our method is easily integrated intoarbitrary rank-based alignment methods, such as direct preference optimization, and can yieldconsistent improvement. The code is available at https://github.com/wangclnlp/DeepSpeed-Chat-Extension/tree/PCRM.{\textquotedblright}"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hang-etal-2024-prior">
<titleInfo>
<title>Prior Constraints-based Reward Model Training for Aligning Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhou</namePart>
<namePart type="family">Hang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wang</namePart>
<namePart type="family">Chenglong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hu</namePart>
<namePart type="family">Yimin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiao</namePart>
<namePart type="family">Tong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhang</namePart>
<namePart type="family">Chunliang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhu</namePart>
<namePart type="family">Jingbo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiye</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianpei</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Taiyuan, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Reinforcement learning with human feedback for aligning large language models (LLMs) trains a reward model typically using ranking loss with comparison pairs. However, the training procedure suffers from an inherent problem: the uncontrolled scaling of reward scores during reinforcement learning due to the lack of constraints while training the reward model. This paper proposes a Prior Constraints-based Reward Model (PCRM) training method to mitigate this problem. PCRM incorporates prior constraints—specifically, length ratio and cosine similarity between outputs of each comparison pair—during reward model training to regulate optimization magnitude and control score margins. We comprehensively evaluate PCRM by examining its rank correlation with human preferences and its effectiveness in aligning LLMs via RL. Experimental results demonstrate that PCRM significantly improves alignment performance by effectively constraining reward score scaling. As another bonus, our method is easily integrated into arbitrary rank-based alignment methods, such as direct preference optimization, and can yield consistent improvement. The code is available at https://github.com/wangclnlp/DeepSpeed-Chat-Extension/tree/PCRM.</abstract>
<identifier type="citekey">hang-etal-2024-prior</identifier>
<location>
<url>https://aclanthology.org/2024.ccl-1.107/</url>
</location>
<part>
<date>2024-07</date>
<extent unit="page">
<start>1395</start>
<end>1407</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Prior Constraints-based Reward Model Training for Aligning Large Language Models
%A Zhou, Hang
%A Wang, Chenglong
%A Hu, Yimin
%A Xiao, Tong
%A Zhang, Chunliang
%A Zhu, Jingbo
%Y Sun, Maosong
%Y Liang, Jiye
%Y Han, Xianpei
%Y Liu, Zhiyuan
%Y He, Yulan
%S Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)
%D 2024
%8 July
%I Chinese Information Processing Society of China
%C Taiyuan, China
%G eng
%F hang-etal-2024-prior
%X Reinforcement learning with human feedback for aligning large language models (LLMs) trains a reward model typically using ranking loss with comparison pairs. However, the training procedure suffers from an inherent problem: the uncontrolled scaling of reward scores during reinforcement learning due to the lack of constraints while training the reward model. This paper proposes a Prior Constraints-based Reward Model (PCRM) training method to mitigate this problem. PCRM incorporates prior constraints—specifically, length ratio and cosine similarity between outputs of each comparison pair—during reward model training to regulate optimization magnitude and control score margins. We comprehensively evaluate PCRM by examining its rank correlation with human preferences and its effectiveness in aligning LLMs via RL. Experimental results demonstrate that PCRM significantly improves alignment performance by effectively constraining reward score scaling. As another bonus, our method is easily integrated into arbitrary rank-based alignment methods, such as direct preference optimization, and can yield consistent improvement. The code is available at https://github.com/wangclnlp/DeepSpeed-Chat-Extension/tree/PCRM.
%U https://aclanthology.org/2024.ccl-1.107/
%P 1395-1407
Markdown (Informal)
[Prior Constraints-based Reward Model Training for Aligning Large Language Models](https://aclanthology.org/2024.ccl-1.107/) (Zhou et al., CCL 2024)
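The abstract above describes two prior constraints, the length ratio and the cosine similarity between the two responses of a comparison pair, used during reward model training to control the score margin of the ranking loss. The snippet below is a minimal, hypothetical PyTorch sketch of one way such a constraint-driven margin could enter a pairwise ranking loss. It is not taken from the paper or the linked repository; the function names, the hinge formulation, and the way the two priors are combined are illustrative assumptions only.

```python
# Illustrative sketch only (not the authors' implementation): a pairwise
# reward-model ranking loss whose margin is derived from prior signals of
# the comparison pair, loosely following the idea sketched in the abstract.
import torch
import torch.nn.functional as F


def prior_margin(len_chosen: int, len_rejected: int,
                 emb_chosen: torch.Tensor, emb_rejected: torch.Tensor,
                 alpha: float = 1.0) -> torch.Tensor:
    """Derive a target score margin from length ratio and cosine similarity.

    Near-identical responses (similar length, high cosine similarity) get a
    margin close to zero; clearly different responses get a larger margin.
    """
    length_ratio = min(len_chosen, len_rejected) / max(len_chosen, len_rejected)
    cos_sim = F.cosine_similarity(emb_chosen, emb_rejected, dim=-1)
    similarity = 0.5 * (length_ratio + cos_sim.clamp(min=0.0))  # roughly in [0, 1]
    return alpha * (1.0 - similarity)


def constrained_ranking_loss(score_chosen: torch.Tensor,
                             score_rejected: torch.Tensor,
                             margin: torch.Tensor) -> torch.Tensor:
    """Hinge-style ranking loss with a per-pair margin.

    Once the chosen score exceeds the rejected score by the prior-derived
    margin, the loss is zero, so the optimizer stops pushing the two reward
    scores further apart; this is one simple way to bound score scaling.
    """
    return torch.clamp(margin - (score_chosen - score_rejected), min=0.0).mean()


if __name__ == "__main__":
    # Toy usage with random vectors standing in for real sentence embeddings.
    emb_c, emb_r = torch.randn(768), torch.randn(768)
    m = prior_margin(len_chosen=42, len_rejected=35,
                     emb_chosen=emb_c, emb_rejected=emb_r)
    loss = constrained_ranking_loss(torch.tensor(1.3), torch.tensor(0.9), m)
    print(float(m), float(loss))
```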