@inproceedings{chen-etal-2026-apex,
title = "{APEX}: Learning Adaptive Priorities for Multi-Objective Alignment in Vision-Language Generation",
author = "Chen, Dongliang and
Zhuang, Xinlin and
Xu, Junjie and
Xie, Luojian and
Wang, Zehui and
Zhuang, Jiaxi and
Yang, Haolin and
Dou, Liang and
He, Xiao and
Wu, Xingjiao and
Qian, Ying",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.243/",
pages = "4922--4939",
ISBN = "979-8-89176-395-1",
abstract = "Multi-objective alignment for text-to-image generation is commonly implemented via static linear scalarization, but \textit{fixed} weights often fail under heterogeneous rewards, leading to optimization imbalance where models overfit high-variance, high-responsiveness objectives (e.g., OCR) while under-optimizing perceptual goals. We identify two mechanistic causes: \textbf{variance hijacking}, where reward dispersion induces implicit reweighting that dominates the normalized training signal, and \textbf{gradient conflicts}, where competing objectives produce opposing update directions and trigger seesaw-like oscillations. We propose \textbf{APEX} (\textbf{A}daptive \textbf{P}riority-based \textbf{E}fficient \textbf{X}-objective Alignment), which stabilizes heterogeneous rewards with \textbf{Dual-Stage Adaptive Normalization} and dynamically schedules objectives via \textbf{$\mathcal{P}^3$ Adaptive Priorities} that combine learning potential, conflict penalty, and progress need. On Stable Diffusion 3.5, APEX achieves improved Pareto trade-offs across four heterogeneous objectives, with balanced gains of \textbf{+1.31 PickScore}, \textbf{+0.35 DeQA}, and \textbf{+0.53 Aesthetics} while maintaining competitive OCR accuracy, mitigating the instability of multi-objective alignment."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2026-apex">
<titleInfo>
<title>APEX: Learning Adaptive Priorities for Multi-Objective Alignment in Vision-Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dongliang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinlin</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junjie</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luojian</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zehui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaxi</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haolin</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Dou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiao</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingjiao</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ying</namePart>
<namePart type="family">Qian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Multi-objective alignment for text-to-image generation is commonly implemented via static linear scalarization, but fixed weights often fail under heterogeneous rewards, leading to optimization imbalance where models overfit high-variance, high-responsiveness objectives (e.g., OCR) while under-optimizing perceptual goals. We identify two mechanistic causes: variance hijacking, where reward dispersion induces implicit reweighting that dominates the normalized training signal, and gradient conflicts, where competing objectives produce opposing update directions and trigger seesaw-like oscillations. We propose APEX (Adaptive Priority-based Efficient X-objective Alignment), which stabilizes heterogeneous rewards with Dual-Stage Adaptive Normalization and dynamically schedules objectives via \mathcalP³ Adaptive Priorities that combine learning potential, conflict penalty, and progress need. On Stable Diffusion 3.5, APEX achieves improved Pareto trade-offs across four heterogeneous objectives, with balanced gains of +1.31 PickScore, +0.35 DeQA, and +0.53 Aesthetics while maintaining competitive OCR accuracy, mitigating the instability of multi-objective alignment.</abstract>
<identifier type="citekey">chen-etal-2026-apex</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.243/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>4922</start>
<end>4939</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T APEX: Learning Adaptive Priorities for Multi-Objective Alignment in Vision-Language Generation
%A Chen, Dongliang
%A Zhuang, Xinlin
%A Xu, Junjie
%A Xie, Luojian
%A Wang, Zehui
%A Zhuang, Jiaxi
%A Yang, Haolin
%A Dou, Liang
%A He, Xiao
%A Wu, Xingjiao
%A Qian, Ying
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F chen-etal-2026-apex
%X Multi-objective alignment for text-to-image generation is commonly implemented via static linear scalarization, but fixed weights often fail under heterogeneous rewards, leading to optimization imbalance where models overfit high-variance, high-responsiveness objectives (e.g., OCR) while under-optimizing perceptual goals. We identify two mechanistic causes: variance hijacking, where reward dispersion induces implicit reweighting that dominates the normalized training signal, and gradient conflicts, where competing objectives produce opposing update directions and trigger seesaw-like oscillations. We propose APEX (Adaptive Priority-based Efficient X-objective Alignment), which stabilizes heterogeneous rewards with Dual-Stage Adaptive Normalization and dynamically schedules objectives via \mathcalP³ Adaptive Priorities that combine learning potential, conflict penalty, and progress need. On Stable Diffusion 3.5, APEX achieves improved Pareto trade-offs across four heterogeneous objectives, with balanced gains of +1.31 PickScore, +0.35 DeQA, and +0.53 Aesthetics while maintaining competitive OCR accuracy, mitigating the instability of multi-objective alignment.
%U https://aclanthology.org/2026.findings-acl.243/
%P 4922-4939
Markdown (Informal)
[APEX: Learning Adaptive Priorities for Multi-Objective Alignment in Vision-Language Generation](https://aclanthology.org/2026.findings-acl.243/) (Chen et al., Findings 2026)
ACL
- Dongliang Chen, Xinlin Zhuang, Junjie Xu, Luojian Xie, Zehui Wang, Jiaxi Zhuang, Haolin Yang, Liang Dou, Xiao He, Xingjiao Wu, and Ying Qian. 2026. APEX: Learning Adaptive Priorities for Multi-Objective Alignment in Vision-Language Generation. In Findings of the Association for Computational Linguistics: ACL 2026, pages 4922–4939, San Diego, California, United States. Association for Computational Linguistics.