@inproceedings{bansal-etal-2025-comparing,
title = "Comparing Bad Apples to Good Oranges Aligning Large Language Models via Joint Preference Optimization",
author = "Bansal, Hritik and
Suvarna, Ashima and
Bhatt, Gantavya and
Peng, Nanyun and
Chang, Kai-Wei and
Grover, Aditya",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.39/",
doi = "10.18653/v1/2025.findings-acl.39",
pages = "701--723",
ISBN = "979-8-89176-256-5",
abstract = "A common technique for aligning large language models (LLMs) relies on acquiring human preferences by comparing multiple generations conditioned on a fixed context. This method, however, relies solely on pairwise comparisons, where the generations are evaluated within an identical context. While effective to such conditional preferences often fail to encompass the nuanced and multidimensional nature of human preferences. In this work, we revisit the traditional paradigm of preference acquisition and propose a new axis based on eliciting preferences jointly over the instruction-response pairs. Unlike prior preference optimizations, which are designed for conditional ranking protocols (e.g., DPO), we propose Joint Preference Optimization (JPO), a new preference optimization objective that upweights the joint probability of the chosen instruction-response pair over the rejected instruction-response pair. Interestingly, LLMs trained with joint instruction-response preference data using JPO outperform LLM trained with DPO by 5.2{\%} and 3.3{\%} win-rate for summarization and open-ended dialogue datasets, respectively. Our findings reveal that joint preferences over instruction and response pairs can significantly enhance the alignment of LLMs by tapping into a broader spectrum of human preference elicitation. The data and code is available athttps://github.com/Hritikbansal/jpo."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bansal-etal-2025-comparing">
<titleInfo>
<title>Comparing Bad Apples to Good Oranges Aligning Large Language Models via Joint Preference Optimization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hritik</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashima</namePart>
<namePart type="family">Suvarna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gantavya</namePart>
<namePart type="family">Bhatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nanyun</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Grover</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>A common technique for aligning large language models (LLMs) relies on acquiring human preferences by comparing multiple generations conditioned on a fixed context. This method, however, relies solely on pairwise comparisons, where the generations are evaluated within an identical context. While effective to such conditional preferences often fail to encompass the nuanced and multidimensional nature of human preferences. In this work, we revisit the traditional paradigm of preference acquisition and propose a new axis based on eliciting preferences jointly over the instruction-response pairs. Unlike prior preference optimizations, which are designed for conditional ranking protocols (e.g., DPO), we propose Joint Preference Optimization (JPO), a new preference optimization objective that upweights the joint probability of the chosen instruction-response pair over the rejected instruction-response pair. Interestingly, LLMs trained with joint instruction-response preference data using JPO outperform LLM trained with DPO by 5.2% and 3.3% win-rate for summarization and open-ended dialogue datasets, respectively. Our findings reveal that joint preferences over instruction and response pairs can significantly enhance the alignment of LLMs by tapping into a broader spectrum of human preference elicitation. The data and code is available athttps://github.com/Hritikbansal/jpo.</abstract>
<identifier type="citekey">bansal-etal-2025-comparing</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.39</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.39/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>701</start>
<end>723</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Comparing Bad Apples to Good Oranges Aligning Large Language Models via Joint Preference Optimization
%A Bansal, Hritik
%A Suvarna, Ashima
%A Bhatt, Gantavya
%A Peng, Nanyun
%A Chang, Kai-Wei
%A Grover, Aditya
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F bansal-etal-2025-comparing
%X A common technique for aligning large language models (LLMs) relies on acquiring human preferences by comparing multiple generations conditioned on a fixed context. This method, however, relies solely on pairwise comparisons, where the generations are evaluated within an identical context. While effective to such conditional preferences often fail to encompass the nuanced and multidimensional nature of human preferences. In this work, we revisit the traditional paradigm of preference acquisition and propose a new axis based on eliciting preferences jointly over the instruction-response pairs. Unlike prior preference optimizations, which are designed for conditional ranking protocols (e.g., DPO), we propose Joint Preference Optimization (JPO), a new preference optimization objective that upweights the joint probability of the chosen instruction-response pair over the rejected instruction-response pair. Interestingly, LLMs trained with joint instruction-response preference data using JPO outperform LLM trained with DPO by 5.2% and 3.3% win-rate for summarization and open-ended dialogue datasets, respectively. Our findings reveal that joint preferences over instruction and response pairs can significantly enhance the alignment of LLMs by tapping into a broader spectrum of human preference elicitation. The data and code is available athttps://github.com/Hritikbansal/jpo.
%R 10.18653/v1/2025.findings-acl.39
%U https://aclanthology.org/2025.findings-acl.39/
%U https://doi.org/10.18653/v1/2025.findings-acl.39
%P 701-723
Markdown (Informal)
[Comparing Bad Apples to Good Oranges Aligning Large Language Models via Joint Preference Optimization](https://aclanthology.org/2025.findings-acl.39/) (Bansal et al., Findings 2025)
ACL