% Conference paper in Findings of ACL 2026 (pages 5450--5472).
% The abstract is carried verbatim from the record apart from a spelling fix.
@inproceedings{xu-etal-2026-understanding,
  title     = {Understanding Conflicts in Multi-Objective Alignment through Reward Consistency},
  author    = {Xu, Zhihao and
               Tong, Yongqi and
               Zhang, Xin and
               Zhou, Jun and
               Wang, Xiting},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.269/},
  pages     = {5450--5472},
  isbn      = {979-8-89176-395-1},
  abstract  = {Multi-objective preference alignment often faces alignment conflicts, where optimizing for one objective (e.g., helpfulness) degrades performance on others (e.g., harmlessness). While prior work focuses on algorithmic solutions, the intrinsic conflict within data and its theoretical impact on training remain underexplored. To bridge this gap, we introduce the principle of Reward Consistency (RC), a theory-grounded criterion that approximates the alignment conflicts via reward models. We prove that a sample mitigates conflicts if and only if it satisfies RC, thereby ensuring improvement across all objectives during optimization. Building on this, we propose Reward Consistency Sampling (RCS), an automated framework for constructing pairwise data that adheres to RC, supplemented by a relaxation strategy to enhance flexibility. Extensive experiments show that RCS brings significant and consistent performance gains, achieving an average improvement of 23.07{\%} in both harmlessness and helpfulness during simultaneous optimization compared to the vanilla dataset. Our data-centric approach is complementary to existing alignment algorithms and effective in both sequential and simultaneous optimization scenarios.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xu-etal-2026-understanding">
<titleInfo>
<title>Understanding Conflicts in Multi-Objective Alignment through Reward Consistency</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhihao</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongqi</namePart>
<namePart type="family">Tong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiting</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Multi-objective preference alignment often faces alignment conflicts, where optimizing for one objective (e.g., helpfulness) degrades performance on others (e.g., harmlessness). While prior work focuses on algorithmic solutions, the intrinsic conflict within data and its theoretical impact on training remain underexplored. To bridge this gap, we introduce the principle of Reward Consistency (RC), a theory-grounded criterion that approximates the alignment conflicts via reward models. We prove that a sample mitigates conflicts if and only if it satisfies RC, thereby ensuring improvement across all objectives during optimization. Building on this, we propose Reward Consistency Sampling (RCS), an automated framework for constructing pairwise data that adheres to RC, supplemented by a relaxation strategy to enhance flexibility. Extensive experiments show that RCS brings significant and consistent performance gains, achieving an average improvement of 23.07% in both harmlessness and helpfulness during simultaneous optimization compared to the vanilla dataset. Our data-centric approach is complementary to existing alignment algorithms and effective in both sequential and simultaneous optimization scenarios.</abstract>
<identifier type="citekey">xu-etal-2026-understanding</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.269/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>5450</start>
<end>5472</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Understanding Conflicts in Multi-Objective Alignment through Reward Consistency
%A Xu, Zhihao
%A Tong, Yongqi
%A Zhang, Xin
%A Zhou, Jun
%A Wang, Xiting
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F xu-etal-2026-understanding
%X Multi-objective preference alignment often faces alignment conflicts, where optimizing for one objective (e.g., helpfulness) degrades performance on others (e.g., harmlessness). While prior work focuses on algorithmic solutions, the intrinsic conflict within data and its theoretical impact on training remain underexplored. To bridge this gap, we introduce the principle of Reward Consistency (RC), a theory-grounded criterion that approximates the alignment conflicts via reward models. We prove that a sample mitigates conflicts if and only if it satisfies RC, thereby ensuring improvement across all objectives during optimization. Building on this, we propose Reward Consistency Sampling (RCS), an automated framework for constructing pairwise data that adheres to RC, supplemented by a relaxation strategy to enhance flexibility. Extensive experiments show that RCS brings significant and consistent performance gains, achieving an average improvement of 23.07% in both harmlessness and helpfulness during simultaneous optimization compared to the vanilla dataset. Our data-centric approach is complementary to existing alignment algorithms and effective in both sequential and simultaneous optimization scenarios.
%U https://aclanthology.org/2026.findings-acl.269/
%P 5450-5472
Markdown (Informal)
[Understanding Conflicts in Multi-Objective Alignment through Reward Consistency](https://aclanthology.org/2026.findings-acl.269/) (Xu et al., Findings 2026)
ACL