@inproceedings{xia-etal-2024-aligning,
title = "Aligning as Debiasing: Causality-Aware Alignment via Reinforcement Learning with Interventional Feedback",
author = "Xia, Yu and
Yu, Tong and
He, Zhankui and
Zhao, Handong and
McAuley, Julian and
Li, Shuai",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.262",
doi = "10.18653/v1/2024.naacl-long.262",
pages = "4684--4695",
abstract = "Large language models (LLMs) often generate biased outputs containing offensive, toxic, or stereotypical text. Existing LLM alignment methods such as reinforcement learning from human feedback (RLHF) alleviate biases primarily based on reward signals from current model outputs without considering the source of biases. In this work, to explore how biases are formed, we revisit LLMs{'} text generation from a causal perspective. We identify pretraining data and input prompts, which contain semantic correlations of textual phrases, as two confounders between LLMs and model outputs causing biases. Inspired by our causal view, we leverage the reward model in RL alignment as an instrumental variable to perform causal intervention on LLMs. Utilizing the reward difference between an initial LLM and intervened LLM as interventional feedback to guide RL finetuning, we propose Causality-Aware Alignment (CAA) for LLM debiasing. Experiments on two text generation tasks with three different alignment objectives demonstrate the advantages of our method in aligning LLMs to generate less biased and safer outputs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xia-etal-2024-aligning">
<titleInfo>
<title>Aligning as Debiasing: Causality-Aware Alignment via Reinforcement Learning with Interventional Feedback</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhankui</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Handong</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">McAuley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuai</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) often generate biased outputs containing offensive, toxic, or stereotypical text. Existing LLM alignment methods such as reinforcement learning from human feedback (RLHF) alleviate biases primarily based on reward signals from current model outputs without considering the source of biases. In this work, to explore how biases are formed, we revisit LLMs’ text generation from a causal perspective. We identify pretraining data and input prompts, which contain semantic correlations of textual phrases, as two confounders between LLMs and model outputs causing biases. Inspired by our causal view, we leverage the reward model in RL alignment as an instrumental variable to perform causal intervention on LLMs. Utilizing the reward difference between an initial LLM and intervened LLM as interventional feedback to guide RL finetuning, we propose Causality-Aware Alignment (CAA) for LLM debiasing. Experiments on two text generation tasks with three different alignment objectives demonstrate the advantages of our method in aligning LLMs to generate less biased and safer outputs.</abstract>
<identifier type="citekey">xia-etal-2024-aligning</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.262</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.262</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>4684</start>
<end>4695</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Aligning as Debiasing: Causality-Aware Alignment via Reinforcement Learning with Interventional Feedback
%A Xia, Yu
%A Yu, Tong
%A He, Zhankui
%A Zhao, Handong
%A McAuley, Julian
%A Li, Shuai
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F xia-etal-2024-aligning
%X Large language models (LLMs) often generate biased outputs containing offensive, toxic, or stereotypical text. Existing LLM alignment methods such as reinforcement learning from human feedback (RLHF) alleviate biases primarily based on reward signals from current model outputs without considering the source of biases. In this work, to explore how biases are formed, we revisit LLMs’ text generation from a causal perspective. We identify pretraining data and input prompts, which contain semantic correlations of textual phrases, as two confounders between LLMs and model outputs causing biases. Inspired by our causal view, we leverage the reward model in RL alignment as an instrumental variable to perform causal intervention on LLMs. Utilizing the reward difference between an initial LLM and intervened LLM as interventional feedback to guide RL finetuning, we propose Causality-Aware Alignment (CAA) for LLM debiasing. Experiments on two text generation tasks with three different alignment objectives demonstrate the advantages of our method in aligning LLMs to generate less biased and safer outputs.
%R 10.18653/v1/2024.naacl-long.262
%U https://aclanthology.org/2024.naacl-long.262
%U https://doi.org/10.18653/v1/2024.naacl-long.262
%P 4684-4695
Markdown (Informal)
[Aligning as Debiasing: Causality-Aware Alignment via Reinforcement Learning with Interventional Feedback](https://aclanthology.org/2024.naacl-long.262) (Xia et al., NAACL 2024)
ACL