@inproceedings{li-etal-2026-learning-temporally,
title = "Learning Temporally-Aware Sample Weights for Preference Optimization",
author = "Li, Mengyang and
Zhou, Xudong and
Zhao, Pinlong",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.601/",
pages = "12361--12377",
ISBN = "979-8-89176-395-1",
abstract = "Preference optimization is fundamental for aligning large language models. While existing methods use sample weighting, they typically rely on static functions of instantaneous model states and ignore temporal learning dynamics. We contend that a sample{'}s value evolves throughout training, characterized by patterns such as stable convergence or noisy oscillation. We propose MetaPO, a framework that meta-learns adaptive weights using three temporal features: reward margin evolution, learning volatility, and reference deviation. Through bilevel optimization on validation data, MetaPO automatically discovers weighting strategies tailored to specific datasets. Experiments on models ranging from 7B to 70B parameters demonstrate statistically significant improvements over strong baselines, achieving gains of up to 2.4 points on AlpacaEval 2.0 and Arena-Hard. Interpretability analysis confirms that temporal features drive over 70{\%} of the weighting decisions and that the learned weights correlate strongly with sample quality."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2026-learning-temporally">
<titleInfo>
<title>Learning Temporally-Aware Sample Weights for Preference Optimization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mengyang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xudong</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinlong</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Preference optimization is fundamental for aligning large language models. While existing methods use sample weighting, they typically rely on static functions of instantaneous model states and ignore temporal learning dynamics. We contend that a sample’s value evolves throughout training, characterized by patterns such as stable convergence or noisy oscillation. We propose MetaPO, a framework that meta-learns adaptive weights using three temporal features: reward margin evolution, learning volatility, and reference deviation. Through bilevel optimization on validation data, MetaPO automatically discovers weighting strategies tailored to specific datasets. Experiments on models ranging from 7B to 70B parameters demonstrate statistically significant improvements over strong baselines, achieving gains of up to 2.4 points on AlpacaEval 2.0 and Arena-Hard. Interpretability analysis confirms that temporal features drive over 70% of the weighting decisions and that the learned weights correlate strongly with sample quality.</abstract>
<identifier type="citekey">li-etal-2026-learning-temporally</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.601/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>12361</start>
<end>12377</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning Temporally-Aware Sample Weights for Preference Optimization
%A Li, Mengyang
%A Zhou, Xudong
%A Zhao, Pinlong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F li-etal-2026-learning-temporally
%X Preference optimization is fundamental for aligning large language models. While existing methods use sample weighting, they typically rely on static functions of instantaneous model states and ignore temporal learning dynamics. We contend that a sample’s value evolves throughout training, characterized by patterns such as stable convergence or noisy oscillation. We propose MetaPO, a framework that meta-learns adaptive weights using three temporal features: reward margin evolution, learning volatility, and reference deviation. Through bilevel optimization on validation data, MetaPO automatically discovers weighting strategies tailored to specific datasets. Experiments on models ranging from 7B to 70B parameters demonstrate statistically significant improvements over strong baselines, achieving gains of up to 2.4 points on AlpacaEval 2.0 and Arena-Hard. Interpretability analysis confirms that temporal features drive over 70% of the weighting decisions and that the learned weights correlate strongly with sample quality.
%U https://aclanthology.org/2026.findings-acl.601/
%P 12361-12377
Markdown (Informal)
[Learning Temporally-Aware Sample Weights for Preference Optimization](https://aclanthology.org/2026.findings-acl.601/) (Li et al., Findings 2026)
ACL