@inproceedings{mamta-etal-2024-biaswipe,
title = "{B}ias{W}ipe: Mitigating Unintended Bias in Text Classifiers through Model Interpretability",
author = "Mamta, Mamta and
Chigrupaatii, Rishikant and
Ekbal, Asif",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1172",
pages = "21059--21070",
abstract = "Toxic content detection plays a vital role in addressing the misuse of social media platforms to harm people or groups due to their race, gender or ethnicity. However, due to the nature of the datasets, systems develop an unintended bias due to the over-generalization of the model to the training data. This compromises the fairness of the systems, which can impact certain groups due to their race, gender, etc.Existing methods mitigate bias using data augmentation, adversarial learning, etc., which require re-training and adding extra parameters to the model.In this work, we present a robust and generalizable technique \textit{BiasWipe} to mitigate unintended bias in language models. \textit{BiasWipe} utilizes model interpretability using Shapley values, which achieve fairness by pruning the neuron weights responsible for unintended bias. It first identifies the neuron weights responsible for unintended bias and then achieves fairness by pruning them without loss of original performance. It does not require re-training or adding extra parameters to the model. To show the effectiveness of our proposed technique for bias unlearning, we perform extensive experiments for Toxic content detection for BERT, RoBERTa, and GPT models. .",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mamta-etal-2024-biaswipe">
<titleInfo>
<title>BiasWipe: Mitigating Unintended Bias in Text Classifiers through Model Interpretability</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mamta</namePart>
<namePart type="family">Mamta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rishikant</namePart>
<namePart type="family">Chigrupaatii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Toxic content detection plays a vital role in addressing the misuse of social media platforms to harm people or groups due to their race, gender, or ethnicity. However, owing to the nature of the datasets, systems develop an unintended bias through over-generalization to the training data. This compromises the fairness of the systems, which can impact certain groups due to their race, gender, etc. Existing methods mitigate bias using data augmentation, adversarial learning, etc., which require re-training and adding extra parameters to the model. In this work, we present a robust and generalizable technique, BiasWipe, to mitigate unintended bias in language models. BiasWipe utilizes model interpretability via Shapley values, achieving fairness by pruning the neuron weights responsible for unintended bias. It first identifies the neuron weights responsible for unintended bias and then prunes them without loss of original performance. It does not require re-training or adding extra parameters to the model. To show the effectiveness of our proposed technique for bias unlearning, we perform extensive experiments on toxic content detection with BERT, RoBERTa, and GPT models.</abstract>
<identifier type="citekey">mamta-etal-2024-biaswipe</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1172</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>21059</start>
<end>21070</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BiasWipe: Mitigating Unintended Bias in Text Classifiers through Model Interpretability
%A Mamta, Mamta
%A Chigrupaatii, Rishikant
%A Ekbal, Asif
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F mamta-etal-2024-biaswipe
%X Toxic content detection plays a vital role in addressing the misuse of social media platforms to harm people or groups due to their race, gender, or ethnicity. However, owing to the nature of the datasets, systems develop an unintended bias through over-generalization to the training data. This compromises the fairness of the systems, which can impact certain groups due to their race, gender, etc. Existing methods mitigate bias using data augmentation, adversarial learning, etc., which require re-training and adding extra parameters to the model. In this work, we present a robust and generalizable technique, BiasWipe, to mitigate unintended bias in language models. BiasWipe utilizes model interpretability via Shapley values, achieving fairness by pruning the neuron weights responsible for unintended bias. It first identifies the neuron weights responsible for unintended bias and then prunes them without loss of original performance. It does not require re-training or adding extra parameters to the model. To show the effectiveness of our proposed technique for bias unlearning, we perform extensive experiments on toxic content detection with BERT, RoBERTa, and GPT models.
%U https://aclanthology.org/2024.emnlp-main.1172
%P 21059-21070
Markdown (Informal)
[BiasWipe: Mitigating Unintended Bias in Text Classifiers through Model Interpretability](https://aclanthology.org/2024.emnlp-main.1172) (Mamta et al., EMNLP 2024)
ACL
Mamta Mamta, Rishikant Chigrupaatii, and Asif Ekbal. 2024. BiasWipe: Mitigating Unintended Bias in Text Classifiers through Model Interpretability. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 21059–21070, Miami, Florida, USA. Association for Computational Linguistics.
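
The abstract describes a mechanism: attribute unintended bias to individual neuron weights via Shapley values, then prune those weights without re-training. Below is a minimal, hedged sketch of that general idea in PyTorch. The toy classifier, the synthetic "identity-term" batch, the permutation-sampling Shapley estimator, and the bias score are all illustrative assumptions, not the authors' implementation (which targets BERT, RoBERTa, and GPT).

```python
# Sketch only: Shapley-style attribution of a bias score to hidden units,
# followed by pruning the most bias-attributed units. Every component here
# (toy model, data, estimator, score) is an assumption for illustration.
import torch
import torch.nn as nn

torch.manual_seed(0)

# Toy binary toxicity classifier standing in for BERT/RoBERTa/GPT (assumption).
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2))
hidden_layer = model[0]
n_units = hidden_layer.out_features

# Synthetic batch of non-toxic "identity-term" inputs (assumption): the model
# should not predict these as toxic, so high toxicity probability = bias.
x_identity = torch.randn(64, 16)

def bias_score(masked_units):
    """Mean toxicity probability on the identity batch with the given
    hidden units zeroed out (higher = more unintended bias)."""
    def hook(_, __, out):
        out = out.clone()
        if masked_units:
            out[:, list(masked_units)] = 0.0
        return out
    handle = hidden_layer.register_forward_hook(hook)
    with torch.no_grad():
        p_toxic = model(x_identity).softmax(dim=-1)[:, 1].mean().item()
    handle.remove()
    return p_toxic

# Monte Carlo Shapley estimate over random removal orders: a unit's value is
# the average drop in the bias score when it joins the set of removed units.
shapley = torch.zeros(n_units)
n_samples = 50
for _ in range(n_samples):
    removed = set()
    prev = bias_score(removed)
    for u in torch.randperm(n_units).tolist():
        removed.add(u)
        cur = bias_score(removed)
        shapley[u] += prev - cur  # positive = unit contributes to bias
        prev = cur
shapley /= n_samples

# Prune (zero out) the weights of the most bias-attributed units; no re-training.
k = 4
top_units = shapley.topk(k).indices
with torch.no_grad():
    hidden_layer.weight[top_units] = 0.0
    hidden_layer.bias[top_units] = 0.0

print("bias score after pruning:", bias_score(set()))
```

Permutation sampling appears here only because exact Shapley values require enumerating all coalitions of units; the paper's actual estimator, bias measure, and pruning criterion may differ.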