@inproceedings{lee-lim-2024-towards,
title = "Towards {P}areto-Efficient {RLHF}: Paying Attention to a Few High-Reward Samples with Reward Dropout",
author = "Lee, Changhun and
Lim, Chiehyeon",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.489",
pages = "8335--8349",
abstract = "Recently, leveraging reinforcement learning (RL) to fine-tune language models (LMs), known as reinforcement learning from human feedback (RLHF), has become an important research topic. However, there is still a lack of theoretical understanding of how RLHF works, the conditions under which it succeeds or fails, and whether it guarantees optimization of both likelihood $\beta(\cdot)$ and reward $R(\cdot)$ objectives. To address these issues, we consider RLHF as a bi-objective problem that has the nature of a \textit{Pareto} optimization, present a Pareto improvement condition that is necessary to obtain Pareto-efficient policies, and propose a simple yet powerful method named \textit{reward dropout} that guarantees a Pareto improvement. To demonstrate the performance of reward dropout, two benchmark datasets commonly used in text style transfer tasks were utilized in our study: sentiment and topic datasets sourced from Yelp and AG{\_}News, respectively. Our experiments highlight that paying attention to a few samples with higher rewards leads to greater Pareto improvements regardless of model size. We also demonstrate that the effect of reward dropout is generalizable and most effective with non-pretrained target models, saving the effort of pretraining.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lee-lim-2024-towards">
<titleInfo>
<title>Towards Pareto-Efficient RLHF: Paying Attention to a Few High-Reward Samples with Reward Dropout</title>
</titleInfo>
<name type="personal">
<namePart type="given">Changhun</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chiehyeon</namePart>
<namePart type="family">Lim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recently, leveraging reinforcement learning (RL) to fine-tune language models (LMs), known as reinforcement learning from human feedback (RLHF), has become an important research topic. However, there is still a lack of theoretical understanding of how RLHF works, the conditions under which it succeeds or fails, and whether it guarantees optimization of both likelihood β(·) and reward R(·) objectives. To address these issues, we consider RLHF as a bi-objective problem that has the nature of a Pareto optimization, present a Pareto improvement condition that is necessary to obtain Pareto-efficient policies, and propose a simple yet powerful method named reward dropout that guarantees a Pareto improvement. To demonstrate the performance of reward dropout, two benchmark datasets commonly used in text style transfer tasks were utilized in our study: sentiment and topic datasets sourced from Yelp and AG_News, respectively. Our experiments highlight that paying attention to a few samples with higher rewards leads to greater Pareto improvements regardless of model size. We also demonstrate that the effect of reward dropout is generalizable and most effective with non-pretrained target models, saving the effort of pretraining.</abstract>
<identifier type="citekey">lee-lim-2024-towards</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.489</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>8335</start>
<end>8349</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Pareto-Efficient RLHF: Paying Attention to a Few High-Reward Samples with Reward Dropout
%A Lee, Changhun
%A Lim, Chiehyeon
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F lee-lim-2024-towards
%X Recently, leveraging reinforcement learning (RL) to fine-tune language models (LMs), known as reinforcement learning from human feedback (RLHF), has become an important research topic. However, there is still a lack of theoretical understanding of how RLHF works, the conditions under which it succeeds or fails, and whether it guarantees optimization of both likelihood β(·) and reward R(·) objectives. To address these issues, we consider RLHF as a bi-objective problem that has the nature of a Pareto optimization, present a Pareto improvement condition that is necessary to obtain Pareto-efficient policies, and propose a simple yet powerful method named reward dropout that guarantees a Pareto improvement. To demonstrate the performance of reward dropout, two benchmark datasets commonly used in text style transfer tasks were utilized in our study: sentiment and topic datasets sourced from Yelp and AG_News, respectively. Our experiments highlight that paying attention to a few samples with higher rewards leads to greater Pareto improvements regardless of model size. We also demonstrate that the effect of reward dropout is generalizable and most effective with non-pretrained target models, saving the effort of pretraining.
%U https://aclanthology.org/2024.findings-emnlp.489
%P 8335-8349
Markdown (Informal)
[Towards Pareto-Efficient RLHF: Paying Attention to a Few High-Reward Samples with Reward Dropout](https://aclanthology.org/2024.findings-emnlp.489) (Lee & Lim, Findings 2024)
ACL
Changhun Lee and Chiehyeon Lim. 2024. Towards Pareto-Efficient RLHF: Paying Attention to a Few High-Reward Samples with Reward Dropout. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 8335–8349, Miami, Florida, USA. Association for Computational Linguistics.
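
The abstract above describes reward dropout as "paying attention to a few samples with higher rewards" during RLHF fine-tuning. As a rough, hedged illustration only (not the authors' implementation; the quantile threshold, the `reward_dropout` helper, and the REINFORCE-style loss below are our own assumptions), the idea might be sketched like this:

```python
# Illustrative sketch only (not the authors' code): one reading of "reward
# dropout" -- zero out every reward below a batch quantile so that a
# REINFORCE-style update is driven only by the few highest-reward samples.
# All names (reward_dropout, keep_quantile, policy_gradient_loss) are assumed.
import torch


def reward_dropout(rewards: torch.Tensor, keep_quantile: float = 0.9) -> torch.Tensor:
    """Keep rewards at or above the batch quantile; drop (zero) the rest."""
    threshold = torch.quantile(rewards, keep_quantile)
    return torch.where(rewards >= threshold, rewards, torch.zeros_like(rewards))


def policy_gradient_loss(logprobs: torch.Tensor, rewards: torch.Tensor) -> torch.Tensor:
    """-E[R * log pi]: samples whose reward was dropped contribute no gradient."""
    kept = reward_dropout(rewards)
    return -(kept.detach() * logprobs).mean()


# Toy usage: a batch of 8 sampled continuations with per-sample
# log-probabilities and rewards.
logprobs = torch.randn(8, requires_grad=True)
rewards = torch.rand(8)
loss = policy_gradient_loss(logprobs, rewards)
loss.backward()
```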