@inproceedings{ye-ng-2024-preference,
title = "Preference-Guided Reflective Sampling for Aligning Language Models",
author = "Ye, Hai and
Ng, Hwee Tou",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1206",
pages = "21646--21668",
abstract = "Iterative data generation and model re-training can effectively align large language models (LLMs) to human preferences. The process of data sampling is crucial, as it significantly influences the success of policy improvement. Repeated random sampling is a widely used method that independently queries the model multiple times to generate outputs. In this work, we propose a more effective sampling method, named Preference-Guided Reflective Sampling (PRS). Unlike random sampling, PRS employs a tree-based generation framework to enable more efficient sampling. It leverages adaptive self-refinement techniques to better explore the sampling space. By specifying user preferences in natural language, PRS can further optimize response generation according to these preferences. As a result, PRS can align models to diverse user preferences. Our experiments demonstrate that PRS generates higher-quality responses with significantly higher rewards. On AlpacaEval and Arena-Hard, PRS substantially outperforms repeated random sampling in best-of-$N$ sampling. Moreover, PRS shows strong performance when applied in iterative offline RL training.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ye-ng-2024-preference">
<titleInfo>
<title>Preference-Guided Reflective Sampling for Aligning Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hai</namePart>
<namePart type="family">Ye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hwee</namePart>
<namePart type="given">Tou</namePart>
<namePart type="family">Ng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Iterative data generation and model re-training can effectively align large language models (LLMs) to human preferences. The process of data sampling is crucial, as it significantly influences the success of policy improvement. Repeated random sampling is a widely used method that independently queries the model multiple times to generate outputs. In this work, we propose a more effective sampling method, named Preference-Guided Reflective Sampling (PRS). Unlike random sampling, PRS employs a tree-based generation framework to enable more efficient sampling. It leverages adaptive self-refinement techniques to better explore the sampling space. By specifying user preferences in natural language, PRS can further optimize response generation according to these preferences. As a result, PRS can align models to diverse user preferences. Our experiments demonstrate that PRS generates higher-quality responses with significantly higher rewards. On AlpacaEval and Arena-Hard, PRS substantially outperforms repeated random sampling in best-of-N sampling. Moreover, PRS shows strong performance when applied in iterative offline RL training.</abstract>
<identifier type="citekey">ye-ng-2024-preference</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1206</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>21646</start>
<end>21668</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Preference-Guided Reflective Sampling for Aligning Language Models
%A Ye, Hai
%A Ng, Hwee Tou
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F ye-ng-2024-preference
%X Iterative data generation and model re-training can effectively align large language models (LLMs) to human preferences. The process of data sampling is crucial, as it significantly influences the success of policy improvement. Repeated random sampling is a widely used method that independently queries the model multiple times to generate outputs. In this work, we propose a more effective sampling method, named Preference-Guided Reflective Sampling (PRS). Unlike random sampling, PRS employs a tree-based generation framework to enable more efficient sampling. It leverages adaptive self-refinement techniques to better explore the sampling space. By specifying user preferences in natural language, PRS can further optimize response generation according to these preferences. As a result, PRS can align models to diverse user preferences. Our experiments demonstrate that PRS generates higher-quality responses with significantly higher rewards. On AlpacaEval and Arena-Hard, PRS substantially outperforms repeated random sampling in best-of-N sampling. Moreover, PRS shows strong performance when applied in iterative offline RL training.
%U https://aclanthology.org/2024.emnlp-main.1206
%P 21646-21668
Markdown (Informal)
[Preference-Guided Reflective Sampling for Aligning Language Models](https://aclanthology.org/2024.emnlp-main.1206) (Ye & Ng, EMNLP 2024)
ACL