@inproceedings{dige-etal-2024-machine,
title = "Can Machine Unlearning Reduce Social Bias in Language Models?",
author = "Dige, Omkar and
Arneja, Diljot and
Yau, Tsz Fung and
Zhang, Qixuan and
Bolandraftar, Mohammad and
Zhu, Xiaodan and
Khattak, Faiza Khan",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.71",
pages = "954--969",
abstract = "Mitigating bias in language models (LMs) has become a critical problem due to the widespread deployment of LMs in the industry and customer-facing applications. Numerous approaches revolve around data pre-processing and subsequent fine-tuning of language models, tasks that can be both time-consuming and computationally demanding. As alternatives, machine unlearning techniques are being explored, yet there is a notable lack of comparative studies evaluating the effectiveness of these methods. In this work, we explore the effectiveness of two machine unlearning methods: Partitioned Contrastive Gradient Unlearning (PCGU) applied on decoder models, and Negation via Task Vector, and compare them with Direct Preference Optimization (DPO) to reduce social biases in open-source LMs such as LLaMA-2 and OPT. We also implement distributed PCGU for large models. It is empirically shown, through quantitative and qualitative analyses, that negation via Task Vector method outperforms PCGU and is comparable to DPO in debiasing models with minimum deterioration in model performance and perplexity. Negation via Task Vector reduces the bias score by 25.5{\%} for LLaMA-2 and achieves bias reduction of up to 40{\%} for OPT models. Moreover, it can be easily tuned to balance the trade-off between bias reduction and generation quality, unlike DPO.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dige-etal-2024-machine">
<titleInfo>
<title>Can Machine Unlearning Reduce Social Bias in Language Models?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Omkar</namePart>
<namePart type="family">Dige</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diljot</namePart>
<namePart type="family">Arneja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsz</namePart>
<namePart type="given">Fung</namePart>
<namePart type="family">Yau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qixuan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="family">Bolandraftar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaodan</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Faiza</namePart>
<namePart type="given">Khan</namePart>
<namePart type="family">Khattak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Mitigating bias in language models (LMs) has become a critical problem due to the widespread deployment of LMs in industry and customer-facing applications. Numerous approaches revolve around data pre-processing and subsequent fine-tuning of language models, tasks that can be both time-consuming and computationally demanding. As alternatives, machine unlearning techniques are being explored, yet there is a notable lack of comparative studies evaluating the effectiveness of these methods. In this work, we explore the effectiveness of two machine unlearning methods: Partitioned Contrastive Gradient Unlearning (PCGU) applied to decoder models, and Negation via Task Vector, and compare them with Direct Preference Optimization (DPO) to reduce social biases in open-source LMs such as LLaMA-2 and OPT. We also implement distributed PCGU for large models. It is empirically shown, through quantitative and qualitative analyses, that the Negation via Task Vector method outperforms PCGU and is comparable to DPO in debiasing models with minimal deterioration in model performance and perplexity. Negation via Task Vector reduces the bias score by 25.5% for LLaMA-2 and achieves a bias reduction of up to 40% for OPT models. Moreover, it can be easily tuned to balance the trade-off between bias reduction and generation quality, unlike DPO.</abstract>
<identifier type="citekey">dige-etal-2024-machine</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.71</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>954</start>
<end>969</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can Machine Unlearning Reduce Social Bias in Language Models?
%A Dige, Omkar
%A Arneja, Diljot
%A Yau, Tsz Fung
%A Zhang, Qixuan
%A Bolandraftar, Mohammad
%A Zhu, Xiaodan
%A Khattak, Faiza Khan
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F dige-etal-2024-machine
%X Mitigating bias in language models (LMs) has become a critical problem due to the widespread deployment of LMs in industry and customer-facing applications. Numerous approaches revolve around data pre-processing and subsequent fine-tuning of language models, tasks that can be both time-consuming and computationally demanding. As alternatives, machine unlearning techniques are being explored, yet there is a notable lack of comparative studies evaluating the effectiveness of these methods. In this work, we explore the effectiveness of two machine unlearning methods: Partitioned Contrastive Gradient Unlearning (PCGU) applied to decoder models, and Negation via Task Vector, and compare them with Direct Preference Optimization (DPO) to reduce social biases in open-source LMs such as LLaMA-2 and OPT. We also implement distributed PCGU for large models. It is empirically shown, through quantitative and qualitative analyses, that the Negation via Task Vector method outperforms PCGU and is comparable to DPO in debiasing models with minimal deterioration in model performance and perplexity. Negation via Task Vector reduces the bias score by 25.5% for LLaMA-2 and achieves a bias reduction of up to 40% for OPT models. Moreover, it can be easily tuned to balance the trade-off between bias reduction and generation quality, unlike DPO.
%U https://aclanthology.org/2024.emnlp-industry.71
%P 954-969
Markdown (Informal)
[Can Machine Unlearning Reduce Social Bias in Language Models?](https://aclanthology.org/2024.emnlp-industry.71) (Dige et al., EMNLP 2024)
ACL
- Omkar Dige, Diljot Arneja, Tsz Fung Yau, Qixuan Zhang, Mohammad Bolandraftar, Xiaodan Zhu, and Faiza Khan Khattak. 2024. Can Machine Unlearning Reduce Social Bias in Language Models? In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 954–969, Miami, Florida, US. Association for Computational Linguistics.
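
For readers skimming the abstract above, a minimal sketch of the Negation via Task Vector idea may help. Assuming a base checkpoint and a copy of it fine-tuned on bias-eliciting data (the fine-tuned path and the `alpha` coefficient below are hypothetical placeholders, not the authors' code), the task vector is the parameter-wise difference between the two models, and debiasing subtracts a scaled copy of it from the base weights.

```python
# Minimal sketch of task-vector negation for debiasing (illustrative only).
# Assumes two checkpoints with identical architectures; the fine-tuned path
# and the alpha value are hypothetical placeholders.
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b")
biased = AutoModelForCausalLM.from_pretrained("path/to/opt-1.3b-biased-finetune")

alpha = 0.4  # scaling coefficient: higher values debias more but can hurt generation quality

base_sd = base.state_dict()
biased_sd = biased.state_dict()

# Task vector = fine-tuned weights minus base weights; negation subtracts a scaled copy.
debiased_sd = {
    name: base_sd[name] - alpha * (biased_sd[name] - base_sd[name])
    for name in base_sd
}

base.load_state_dict(debiased_sd)
base.save_pretrained("opt-1.3b-task-vector-negated")
```

Sweeping `alpha` is the tuning knob the abstract refers to when it says the method can balance the trade-off between bias reduction and generation quality.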