@inproceedings{le-etal-2026-causal,
title = "Causal Direct Preference Optimization for Language Model Alignment",
author = "Le, Uyen and
Nguyen, Thin and
Nguyen, Toan and
Doan, Toan and
Le, Trung and
Le, Bac",
editor = "Demberg, Vera and
Inui, Kentaro and
M{\`a}rquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.58/",
pages = "1098--1113",
ISBN = "979-8-89176-386-9",
abstract = "Direct Preference Optimization (DPO) is a powerful approach for aligning large language models (LLMs) with human preferences by formulating preference learning as a supervised classification problem over pairwise human-labeled outputs, thereby enabling stable and efficient training. We show that DPO inherits bias from confounders (e.g., topic, style, user objectives) that shape data generation and carry through to training, hindering recovery of true human preferences. We address this from a causal perspective, proposing Causal Direct Preference Optimization (CDPO), a general framework that incorporates causal inference principles to mitigate the influence of confounders and sharpen the signal of genuine human preferences. Our approach preserves the tractability of direct optimization while enhancing robustness to spurious correlations and annotation biases. Empirical evaluations on benchmark datasets show that CDPO surpasses DPO-based baselines by achieving unbiased fine-tuning through causal reasoning, confirming the effectiveness of confounder-aware preference optimization."
}
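
Background note (not part of the citation record above): the abstract presupposes the DPO objective without stating it. For reference, the standard formulation from Rafailov et al. (2023), which CDPO builds on, trains a policy \pi_\theta against a frozen reference policy \pi_{\mathrm{ref}} on preference triples (x, y_w, y_l). The abstract does not give CDPO's own confounder-adjusted objective, so only the base loss is reproduced here:

\[
\mathcal{L}_{\mathrm{DPO}}(\pi_\theta; \pi_{\mathrm{ref}})
  = -\,\mathbb{E}_{(x,\, y_w,\, y_l) \sim \mathcal{D}}
    \left[ \log \sigma\!\left(
        \beta \log \frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)}
      - \beta \log \frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}
    \right) \right]
\]

Here x is the prompt, y_w and y_l are the preferred and dispreferred responses, \sigma is the logistic sigmoid, and \beta is a temperature controlling how far the policy may drift from the reference model.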