@inproceedings{christopoulou-etal-2025-sparsepo,
title = "{S}parse{PO}: Controlling Preference Alignment of {LLM}s via Sparse Token Masks",
author = "Christopoulou, Fenia and
Cardenas, Ronald and
Lampouras, Gerasimos and
Bou Ammar, Haitham and
Wang, Jun",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1389/",
pages = "25477--25503",
ISBN = "979-8-89176-335-7",
abstract = "Direct alignment algorithms have proven an effective step for aligning language models to human-desired behaviors. Current variants of the Direct Preference Optimization objective have focused on a strict setting where all tokens are contributing signals of KL divergence and rewards to the loss function.However, human preference is not affected equally by each word in a sequence but is often dependent on specific words or phrases, e.g. existence of toxic terms leads to non-preferred responses. Based on this observation, we argue that not all tokens should be weighted equally during PO and propose a flexible objective termed SparsePO, that aims to automatically learn to weight the KL divergence and reward corresponding to each token during PO training. We propose two different variants of weight-masks that can either be derived from the reference model itself or learned on the fly. Notably, our method induces sparsity in the learned masks, allowing the model to learn how to best balance reward and KL divergence contributions at the token level, learning an optimal level of mask sparsity.Extensive experiments illustrate the effectiveness of our approach at aligning to preference proxies, including sentiment control, helpfulness and harmless, and summary quality.Our method obtains $+10\%$ and $+3\%$ win-rate points in summarization and dialogue scenarios, respectively,without compromising the reasoning capabilities of the model, or the relevancy and faithfulness of the summary response."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="christopoulou-etal-2025-sparsepo">
<titleInfo>
<title>SparsePO: Controlling Preference Alignment of LLMs via Sparse Token Masks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fenia</namePart>
<namePart type="family">Christopoulou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronald</namePart>
<namePart type="family">Cardenas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerasimos</namePart>
<namePart type="family">Lampouras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haitham</namePart>
<namePart type="family">Bou Ammar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Direct alignment algorithms have proven to be an effective step for aligning language models to human-desired behaviors. Current variants of the Direct Preference Optimization objective have focused on a strict setting where all tokens contribute KL divergence and reward signals to the loss function. However, human preference is not affected equally by each word in a sequence but often depends on specific words or phrases, e.g., the existence of toxic terms leads to non-preferred responses. Based on this observation, we argue that not all tokens should be weighted equally during PO and propose a flexible objective, termed SparsePO, which aims to automatically learn to weight the KL divergence and reward corresponding to each token during PO training. We propose two variants of weight masks that can either be derived from the reference model itself or learned on the fly. Notably, our method induces sparsity in the learned masks, allowing the model to balance reward and KL divergence contributions at the token level while learning an optimal level of mask sparsity. Extensive experiments illustrate the effectiveness of our approach at aligning to preference proxies, including sentiment control, helpfulness and harmlessness, and summary quality. Our method obtains +10% and +3% win-rate points in summarization and dialogue scenarios, respectively, without compromising the reasoning capabilities of the model or the relevancy and faithfulness of the summary response.</abstract>
<identifier type="citekey">christopoulou-etal-2025-sparsepo</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1389/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>25477</start>
<end>25503</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SparsePO: Controlling Preference Alignment of LLMs via Sparse Token Masks
%A Christopoulou, Fenia
%A Cardenas, Ronald
%A Lampouras, Gerasimos
%A Bou Ammar, Haitham
%A Wang, Jun
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F christopoulou-etal-2025-sparsepo
%X Direct alignment algorithms have proven to be an effective step for aligning language models to human-desired behaviors. Current variants of the Direct Preference Optimization objective have focused on a strict setting where all tokens contribute KL divergence and reward signals to the loss function. However, human preference is not affected equally by each word in a sequence but often depends on specific words or phrases, e.g., the existence of toxic terms leads to non-preferred responses. Based on this observation, we argue that not all tokens should be weighted equally during PO and propose a flexible objective, termed SparsePO, which aims to automatically learn to weight the KL divergence and reward corresponding to each token during PO training. We propose two variants of weight masks that can either be derived from the reference model itself or learned on the fly. Notably, our method induces sparsity in the learned masks, allowing the model to balance reward and KL divergence contributions at the token level while learning an optimal level of mask sparsity. Extensive experiments illustrate the effectiveness of our approach at aligning to preference proxies, including sentiment control, helpfulness and harmlessness, and summary quality. Our method obtains +10% and +3% win-rate points in summarization and dialogue scenarios, respectively, without compromising the reasoning capabilities of the model or the relevancy and faithfulness of the summary response.
%U https://aclanthology.org/2025.findings-emnlp.1389/
%P 25477-25503
Markdown (Informal)
[SparsePO: Controlling Preference Alignment of LLMs via Sparse Token Masks](https://aclanthology.org/2025.findings-emnlp.1389/) (Christopoulou et al., Findings 2025)
ACL
Fenia Christopoulou, Ronald Cardenas, Gerasimos Lampouras, Haitham Bou Ammar, and Jun Wang. 2025. SparsePO: Controlling Preference Alignment of LLMs via Sparse Token Masks. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 25477–25503, Suzhou, China. Association for Computational Linguistics.
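
Note (informal): the abstract describes weighting each token's reward and KL contribution with a (possibly sparse) mask inside a DPO-style objective. The snippet below is only a minimal PyTorch sketch of that general idea, assuming per-token log-probabilities and precomputed masks as inputs; the function and variable names (masked_dpo_loss, policy_logps, etc.) are hypothetical and this is not the paper's actual SparsePO implementation (see the paper at the URL above for that).

# Minimal sketch: a token-masked, DPO-style preference loss (illustrative only).
import torch
import torch.nn.functional as F

def masked_dpo_loss(policy_logps, ref_logps, masks, pad_masks, beta=0.1):
    """
    policy_logps, ref_logps: dicts with 'chosen'/'rejected' per-token
        log-probabilities, each of shape (batch, seq_len).
    masks: dict of per-token weights in [0, 1] for 'chosen'/'rejected';
        in the paper these are derived from the reference model or learned,
        here they are simply given as inputs.
    pad_masks: dict of 0/1 tensors marking valid (non-padding) tokens.
    """
    def weighted_logratio(key):
        # Per-token log-ratio between policy and reference, scaled by the mask,
        # then summed over the sequence.
        ratio = policy_logps[key] - ref_logps[key]
        return (masks[key] * ratio * pad_masks[key]).sum(dim=-1)

    # Preference margin between chosen and rejected responses.
    margin = weighted_logratio("chosen") - weighted_logratio("rejected")
    # Logistic (Bradley-Terry style) loss on the scaled margin, as in DPO.
    return -F.logsigmoid(beta * margin).mean()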