@inproceedings{sturman-etal-2024-debiasing,
title = "Debiasing Text Safety Classifiers through a Fairness-Aware Ensemble",
author = "Sturman, Olivia and
Joshi, Aparna R and
Radharapu, Bhaktipriya and
Kumar, Piyush and
Shelby, Renee",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.16",
pages = "199--214",
abstract = "Increasing use of large language models (LLMs) demand performant guardrails to ensure the safety of inputs and outputs of LLMs. When these safeguards are trained on imbalanced data, they can learn the societal biases. We present a light-weight, post-processing method for mitigating counterfactual fairness in closed-source text safety classifiers. Our approach involves building an ensemble that not only outperforms the input classifiers and policy-aligns them, but also acts as a debiasing regularizer. We introduce two threshold-agnostic metrics to assess the counterfactual fairness of a model, and demonstrate how combining these metrics with Fair Data Reweighting (FDW) helps mitigate biases. We create an expanded Open AI dataset, and a new templated LLM-generated dataset based on user-prompts, both of which are counterfactually balanced across identity groups and cover four key areas of safety; we will work towards publicly releasing these datasets. Our results show that our approach improves counterfactual fairness with minimal impact on model performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sturman-etal-2024-debiasing">
    <titleInfo>
      <title>Debiasing Text Safety Classifiers through a Fairness-Aware Ensemble</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Olivia</namePart>
      <namePart type="family">Sturman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aparna</namePart>
      <namePart type="given">R</namePart>
      <namePart type="family">Joshi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bhaktipriya</namePart>
      <namePart type="family">Radharapu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Piyush</namePart>
      <namePart type="family">Kumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Renee</namePart>
      <namePart type="family">Shelby</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Franck</namePart>
        <namePart type="family">Dernoncourt</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Preoţiuc-Pietro</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anastasia</namePart>
        <namePart type="family">Shimorina</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, US</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Increasing use of large language models (LLMs) demands performant guardrails to ensure the safety of inputs and outputs of LLMs. When these safeguards are trained on imbalanced data, they can learn societal biases. We present a lightweight, post-processing method for improving counterfactual fairness in closed-source text safety classifiers. Our approach involves building an ensemble that not only outperforms the input classifiers and policy-aligns them, but also acts as a debiasing regularizer. We introduce two threshold-agnostic metrics to assess the counterfactual fairness of a model, and demonstrate how combining these metrics with Fair Data Reweighting (FDW) helps mitigate biases. We create an expanded OpenAI dataset and a new templated LLM-generated dataset based on user prompts, both of which are counterfactually balanced across identity groups and cover four key areas of safety; we will work towards publicly releasing these datasets. Our results show that our approach improves counterfactual fairness with minimal impact on model performance.</abstract>
    <identifier type="citekey">sturman-etal-2024-debiasing</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-industry.16</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>199</start>
        <end>214</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Debiasing Text Safety Classifiers through a Fairness-Aware Ensemble
%A Sturman, Olivia
%A Joshi, Aparna R.
%A Radharapu, Bhaktipriya
%A Kumar, Piyush
%A Shelby, Renee
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F sturman-etal-2024-debiasing
%X Increasing use of large language models (LLMs) demands performant guardrails to ensure the safety of inputs and outputs of LLMs. When these safeguards are trained on imbalanced data, they can learn societal biases. We present a lightweight, post-processing method for improving counterfactual fairness in closed-source text safety classifiers. Our approach involves building an ensemble that not only outperforms the input classifiers and policy-aligns them, but also acts as a debiasing regularizer. We introduce two threshold-agnostic metrics to assess the counterfactual fairness of a model, and demonstrate how combining these metrics with Fair Data Reweighting (FDW) helps mitigate biases. We create an expanded OpenAI dataset and a new templated LLM-generated dataset based on user prompts, both of which are counterfactually balanced across identity groups and cover four key areas of safety; we will work towards publicly releasing these datasets. Our results show that our approach improves counterfactual fairness with minimal impact on model performance.
%U https://aclanthology.org/2024.emnlp-industry.16
%P 199-214
Markdown (Informal)
[Debiasing Text Safety Classifiers through a Fairness-Aware Ensemble](https://aclanthology.org/2024.emnlp-industry.16) (Sturman et al., EMNLP 2024)
ACL
Olivia Sturman, Aparna R Joshi, Bhaktipriya Radharapu, Piyush Kumar, and Renee Shelby. 2024. Debiasing Text Safety Classifiers through a Fairness-Aware Ensemble. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 199–214, Miami, Florida, US. Association for Computational Linguistics.
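
The abstract above refers to two threshold-agnostic metrics for assessing a model's counterfactual fairness. As a rough illustration of that general idea only, and not the metrics or code defined in the paper, the Python sketch below scores identity-swapped variants of a templated prompt and reports the spread of the raw classifier scores, so no decision threshold is involved. Every name in it (toy_score, the templates, the identity terms) is invented for the example.

# Illustrative sketch only, not the paper's implementation: a generic,
# threshold-agnostic counterfactual fairness probe that compares raw safety
# scores across identity-swapped variants of the same templated prompt.
from typing import Callable, List


def counterfactual_score_gap(
    score_fn: Callable[[str], float],
    template: str,
    identity_terms: List[str],
    slot: str = "{identity}",
) -> float:
    """Spread of classifier scores across identity swaps of one template."""
    scores = [score_fn(template.replace(slot, term)) for term in identity_terms]
    return max(scores) - min(scores)


def mean_counterfactual_gap(
    score_fn: Callable[[str], float],
    templates: List[str],
    identity_terms: List[str],
) -> float:
    """Average score gap over a set of counterfactually balanced templates."""
    gaps = [counterfactual_score_gap(score_fn, t, identity_terms) for t in templates]
    return sum(gaps) / len(gaps)


if __name__ == "__main__":
    # Hypothetical stand-in for a closed-source safety classifier score in [0, 1].
    def toy_score(text: str) -> float:
        return 0.9 if "group_a" in text else 0.2

    templates = [
        "I can't stand {identity} people.",
        "{identity} people are welcome in this community.",
    ]
    terms = ["group_a", "group_b"]
    # A large gap means the score depends on the identity term, i.e. the scores are biased.
    print(f"mean counterfactual score gap: {mean_counterfactual_gap(toy_score, templates, terms):.2f}")

Because this kind of probe operates on raw scores rather than thresholded labels, the same quantity can be compared before and after a debiasing step without first fixing an operating point.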