@inproceedings{nath-etal-2025-dpl,
title = "{DPL}: Diverse Preference Learning Without A Reference Model",
author = "Nath, Abhijnan and
Volozin, Andrey and
Saha, Saumajit and
Nanda, Albert Aristotle and
Grunin, Galina and
Bhotika, Rahul and
Krishnaswamy, Nikhil",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.190/",
doi = "10.18653/v1/2025.naacl-long.190",
pages = "3727--3747",
ISBN = "979-8-89176-189-6",
abstract = "In direct preference alignment in LLMs, most existing methods seek to retrieve the reward function directly from preference data. However, real-world preference data often contains diversity in preference annotations reflective of true human preferences. Existing algorithms, including KTO, do not directly utilize such nuances in the annotations which limits their applicability. In this work, we propose Diverse Preference Learning (DPL), a reference model-free method that simultaneously learns a baseline desirability in LLM responses while being robust to the diversity of preference annotations. Our experiments for instruction-following on Ultrafeedback and AlpacaEval 2.0 and for text-summarization on Reddit TL;DR suggest that DPL is consistently better at learning the diversity of preferences compared to existing methods, including those that require a reference model in memory. Apart from overall quality, we find that DPL{'}s completions, on average, are more honest, helpful, truthful and safe compared to existing methods."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nath-etal-2025-dpl">
<titleInfo>
<title>DPL: Diverse Preference Learning Without A Reference Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abhijnan</namePart>
<namePart type="family">Nath</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrey</namePart>
<namePart type="family">Volozin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saumajit</namePart>
<namePart type="family">Saha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Albert</namePart>
<namePart type="given">Aristotle</namePart>
<namePart type="family">Nanda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galina</namePart>
<namePart type="family">Grunin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Bhotika</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikhil</namePart>
<namePart type="family">Krishnaswamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
    <abstract>In direct preference alignment in LLMs, most existing methods seek to retrieve the reward function directly from preference data. However, real-world preference data often contains diversity in preference annotations reflective of true human preferences. Existing algorithms, including KTO, do not directly utilize such nuances in the annotations, which limits their applicability. In this work, we propose Diverse Preference Learning (DPL), a reference model-free method that simultaneously learns a baseline desirability in LLM responses while being robust to the diversity of preference annotations. Our experiments for instruction-following on Ultrafeedback and AlpacaEval 2.0 and for text-summarization on Reddit TL;DR suggest that DPL is consistently better at learning the diversity of preferences compared to existing methods, including those that require a reference model in memory. Apart from overall quality, we find that DPL’s completions, on average, are more honest, helpful, truthful and safe compared to existing methods.</abstract>
<identifier type="citekey">nath-etal-2025-dpl</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.190</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.190/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>3727</start>
<end>3747</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DPL: Diverse Preference Learning Without A Reference Model
%A Nath, Abhijnan
%A Volozin, Andrey
%A Saha, Saumajit
%A Nanda, Albert Aristotle
%A Grunin, Galina
%A Bhotika, Rahul
%A Krishnaswamy, Nikhil
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F nath-etal-2025-dpl
%X In direct preference alignment in LLMs, most existing methods seek to retrieve the reward function directly from preference data. However, real-world preference data often contains diversity in preference annotations reflective of true human preferences. Existing algorithms, including KTO, do not directly utilize such nuances in the annotations, which limits their applicability. In this work, we propose Diverse Preference Learning (DPL), a reference model-free method that simultaneously learns a baseline desirability in LLM responses while being robust to the diversity of preference annotations. Our experiments for instruction-following on Ultrafeedback and AlpacaEval 2.0 and for text-summarization on Reddit TL;DR suggest that DPL is consistently better at learning the diversity of preferences compared to existing methods, including those that require a reference model in memory. Apart from overall quality, we find that DPL’s completions, on average, are more honest, helpful, truthful and safe compared to existing methods.
%R 10.18653/v1/2025.naacl-long.190
%U https://aclanthology.org/2025.naacl-long.190/
%U https://doi.org/10.18653/v1/2025.naacl-long.190
%P 3727-3747
Markdown (Informal)
[DPL: Diverse Preference Learning Without A Reference Model](https://aclanthology.org/2025.naacl-long.190/) (Nath et al., NAACL 2025)
ACL
Abhijnan Nath, Andrey Volozin, Saumajit Saha, Albert Aristotle Nanda, Galina Grunin, Rahul Bhotika, and Nikhil Krishnaswamy. 2025. DPL: Diverse Preference Learning Without A Reference Model. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 3727–3747, Albuquerque, New Mexico. Association for Computational Linguistics.
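
Note: the abstract describes the key idea (dropping the frozen reference model and instead learning a baseline desirability for responses) but not the objective itself. The sketch below is purely a hedged illustration of what a reference-model-free, KTO-style preference loss could look like; it is not the DPL objective from the paper, and the class name, the `beta` scale, and the exact loss form are all assumptions made for illustration.

```python
import torch

class ReferenceFreePreferenceLoss(torch.nn.Module):
    """Hypothetical sketch, NOT the paper's objective: a KTO-style loss with
    a learned scalar baseline in place of a frozen reference model."""

    def __init__(self, beta: float = 0.1):
        super().__init__()
        self.beta = beta
        # Learned "baseline desirability": desirable responses should score
        # above it, undesirable ones below it.
        self.baseline = torch.nn.Parameter(torch.zeros(()))

    def forward(self, logp_policy: torch.Tensor, is_desirable: torch.Tensor) -> torch.Tensor:
        # Implicit reward from the policy's own log-probabilities alone;
        # no second set of reference-model weights is kept in memory.
        reward = self.beta * logp_policy
        loss = torch.where(
            is_desirable,
            1.0 - torch.sigmoid(reward - self.baseline),  # push above baseline
            1.0 - torch.sigmoid(self.baseline - reward),  # push below baseline
        )
        return loss.mean()

# Toy usage: summed per-sequence log-probs under the current policy and
# binary desirability labels from (possibly diverse or noisy) annotations.
logp = torch.tensor([-12.3, -15.1, -9.8, -20.4])
labels = torch.tensor([True, False, True, False])
criterion = ReferenceFreePreferenceLoss(beta=0.1)
print(criterion(logp, labels))  # scalar loss; the baseline trains jointly
```

Because the stand-in baseline here is a single learned scalar rather than a full reference policy, only one copy of the model weights stays in memory during training, which is the practical appeal of the "reference model-free" setting the abstract names; how DPL actually models annotation diversity is specified in the paper itself.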