@inproceedings{chen-etal-2024-self-evolution,
title = "Self-Evolution Fine-Tuning for Policy Optimization",
author = "Chen, Ruijun and
Liang, Jiehao and
Gao, Shiping and
Wan, Fanqi and
Quan, Xiaojun",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.238",
pages = "4120--4137",
abstract = "The alignment of large language models (LLMs) is crucial not only for unlocking their potential in specific tasks but also for ensuring that responses meet human expectations and adhere to safety and ethical principles. To address the challenges of current alignment methodologies, we introduce self-evolution fine-tuning (SEFT) for LLM alignment, aiming to eliminate the need for annotated samples while retaining the stability and efficiency of SFT. SEFT first trains an adaptive reviser to elevate low-quality responses while maintaining high-quality ones. The reviser then gradually guides the policy{'}s optimization by fine-tuning it with enhanced responses. The method excels in utilizing unlimited unannotated data to optimize policies via supervised fine-tuning. Our experiments on AlpacaEval and MT-Bench demonstrate the effectiveness of SEFT and its advantages over existing alignment techniques.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2024-self-evolution">
<titleInfo>
<title>Self-Evolution Fine-Tuning for Policy Optimization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruijun</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiehao</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shiping</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fanqi</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Quan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The alignment of large language models (LLMs) is crucial not only for unlocking their potential in specific tasks but also for ensuring that responses meet human expectations and adhere to safety and ethical principles. To address the challenges of current alignment methodologies, we introduce self-evolution fine-tuning (SEFT) for LLM alignment, aiming to eliminate the need for annotated samples while retaining the stability and efficiency of SFT. SEFT first trains an adaptive reviser to elevate low-quality responses while maintaining high-quality ones. The reviser then gradually guides the policy’s optimization by fine-tuning it with enhanced responses. The method excels in utilizing unlimited unannotated data to optimize policies via supervised fine-tuning. Our experiments on AlpacaEval and MT-Bench demonstrate the effectiveness of SEFT and its advantages over existing alignment techniques.</abstract>
<identifier type="citekey">chen-etal-2024-self-evolution</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.238</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>4120</start>
<end>4137</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Self-Evolution Fine-Tuning for Policy Optimization
%A Chen, Ruijun
%A Liang, Jiehao
%A Gao, Shiping
%A Wan, Fanqi
%A Quan, Xiaojun
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F chen-etal-2024-self-evolution
%X The alignment of large language models (LLMs) is crucial not only for unlocking their potential in specific tasks but also for ensuring that responses meet human expectations and adhere to safety and ethical principles. To address the challenges of current alignment methodologies, we introduce self-evolution fine-tuning (SEFT) for LLM alignment, aiming to eliminate the need for annotated samples while retaining the stability and efficiency of SFT. SEFT first trains an adaptive reviser to elevate low-quality responses while maintaining high-quality ones. The reviser then gradually guides the policy’s optimization by fine-tuning it with enhanced responses. The method excels in utilizing unlimited unannotated data to optimize policies via supervised fine-tuning. Our experiments on AlpacaEval and MT-Bench demonstrate the effectiveness of SEFT and its advantages over existing alignment techniques.
%U https://aclanthology.org/2024.findings-emnlp.238
%P 4120-4137
Markdown (Informal)
[Self-Evolution Fine-Tuning for Policy Optimization](https://aclanthology.org/2024.findings-emnlp.238) (Chen et al., Findings 2024)
ACL
- Ruijun Chen, Jiehao Liang, Shiping Gao, Fanqi Wan, and Xiaojun Quan. 2024. Self-Evolution Fine-Tuning for Policy Optimization. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 4120–4137, Miami, Florida, USA. Association for Computational Linguistics.