@inproceedings{mao-etal-2024-dont,
    title = "Don't Forget Your Reward Values: Language Model Alignment via Value-based Calibration",
    author = "Mao, Xin and
      Li, Feng-Lin and
      Xu, Huimin and
      Zhang, Wei and
      Chen, Wang and
      Luu, Anh Tuan",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-main.976/",
    doi = "10.18653/v1/2024.emnlp-main.976",
    pages = "17622--17642",
    abstract = "While Reinforcement Learning from Human Feedback (RLHF) significantly enhances the generation quality of Large Language Models (LLMs), recent studies have raised concerns regarding the complexity and instability associated with the Proximal Policy Optimization (PPO) algorithm, proposing a series of order-based alignment methods as viable alternatives. This paper delves into existing order-based methods, unifying them into one framework and examining their inefficiencies in utilizing reward values. Building upon these findings, we propose a new Value-based Calibration (VCB) method to better align LLMs with human preferences. Experimental results demonstrate that VCB surpasses existing alignment methods on AI assistant and summarization datasets, providing impressive generalizability, robustness, and diversity in different settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mao-etal-2024-dont">
    <titleInfo>
      <title>Don't Forget Your Reward Values: Language Model Alignment via Value-based Calibration</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Xin</namePart>
      <namePart type="family">Mao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Feng-Lin</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Huimin</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wei</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wang</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anh</namePart>
      <namePart type="given">Tuan</namePart>
      <namePart type="family">Luu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>While Reinforcement Learning from Human Feedback (RLHF) significantly enhances the generation quality of Large Language Models (LLMs), recent studies have raised concerns regarding the complexity and instability associated with the Proximal Policy Optimization (PPO) algorithm, proposing a series of order-based alignment methods as viable alternatives. This paper delves into existing order-based methods, unifying them into one framework and examining their inefficiencies in utilizing reward values. Building upon these findings, we propose a new Value-based Calibration (VCB) method to better align LLMs with human preferences. Experimental results demonstrate that VCB surpasses existing alignment methods on AI assistant and summarization datasets, providing impressive generalizability, robustness, and diversity in different settings.</abstract>
    <identifier type="citekey">mao-etal-2024-dont</identifier>
    <identifier type="doi">10.18653/v1/2024.emnlp-main.976</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-main.976/</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>17622</start>
        <end>17642</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Don't Forget Your Reward Values: Language Model Alignment via Value-based Calibration
%A Mao, Xin
%A Li, Feng-Lin
%A Xu, Huimin
%A Zhang, Wei
%A Chen, Wang
%A Luu, Anh Tuan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F mao-etal-2024-dont
%X While Reinforcement Learning from Human Feedback (RLHF) significantly enhances the generation quality of Large Language Models (LLMs), recent studies have raised concerns regarding the complexity and instability associated with the Proximal Policy Optimization (PPO) algorithm, proposing a series of order-based alignment methods as viable alternatives. This paper delves into existing order-based methods, unifying them into one framework and examining their inefficiencies in utilizing reward values. Building upon these findings, we propose a new Value-based Calibration (VCB) method to better align LLMs with human preferences. Experimental results demonstrate that VCB surpasses existing alignment methods on AI assistant and summarization datasets, providing impressive generalizability, robustness, and diversity in different settings.
%R 10.18653/v1/2024.emnlp-main.976
%U https://aclanthology.org/2024.emnlp-main.976/
%U https://doi.org/10.18653/v1/2024.emnlp-main.976
%P 17622-17642
Markdown (Informal)
[Don’t Forget Your Reward Values: Language Model Alignment via Value-based Calibration](https://aclanthology.org/2024.emnlp-main.976/) (Mao et al., EMNLP 2024)
ACL
Xin Mao, Feng-Lin Li, Huimin Xu, Wei Zhang, Wang Chen, and Anh Tuan Luu. 2024. Don't Forget Your Reward Values: Language Model Alignment via Value-based Calibration. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 17622–17642, Miami, Florida, USA. Association for Computational Linguistics.