BibTeX
@inproceedings{raza-etal-2024-mbias,
title = "{MBIAS}: Mitigating Bias in Large Language Models While Retaining Context",
author = "Raza, Shaina and
Raval, Ananya and
Chatrath, Veronica",
editor = "De Clercq, Orph{\'e}e and
Barriere, Valentin and
Barnes, Jeremy and
Klinger, Roman and
Sedoc, Jo{\~a}o and
Tafreshi, Shabnam",
booktitle = "Proceedings of the 14th Workshop on Computational Approaches to Subjectivity, Sentiment, {\&} Social Media Analysis",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wassa-1.9",
pages = "97--111",
abstract = "The deployment of Large Language Models (LLMs) in diverse applications necessitates an assurance of safety without compromising the contextual integrity of the generated content. Traditional approaches, including safety-specific fine-tuning or adversarial testing, often yield safe outputs at the expense of contextual meaning. This can result in a diminished capacity to handle nuanced aspects of bias and toxicity, such as underrepresentation or negative portrayals across various demographics. To address these challenges, we introduce MBIAS, an LLM framework carefully instruction fine-tuned on a custom dataset designed specifically for safety interventions. MBIAS is designed to significantly reduce biases and toxic elements in LLM outputs while preserving the main information. This work also details our further use of LLMs: as annotator under human supervision and as evaluator of generated content. Empirical analysis reveals that MBIAS achieves a reduction in bias and toxicity by over 30{\%} in standard evaluations, and by more than 90{\%} in diverse demographic tests, highlighting the robustness of our approach. We make the dataset and the fine-tuned MBIAS model available to the research community for further investigation and to ensure reproducibility. The code for this project can be accessed here https://github.com/shainarazavi/MBIAS.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="raza-etal-2024-mbias">
<titleInfo>
<title>MBIAS: Mitigating Bias in Large Language Models While Retaining Context</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shaina</namePart>
<namePart type="family">Raza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ananya</namePart>
<namePart type="family">Raval</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronica</namePart>
<namePart type="family">Chatrath</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th Workshop on Computational Approaches to Subjectivity, Sentiment, &amp; Social Media Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Orphée</namePart>
<namePart type="family">De Clercq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Barriere</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jeremy</namePart>
<namePart type="family">Barnes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Klinger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shabnam</namePart>
<namePart type="family">Tafreshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The deployment of Large Language Models (LLMs) in diverse applications necessitates an assurance of safety without compromising the contextual integrity of the generated content. Traditional approaches, including safety-specific fine-tuning or adversarial testing, often yield safe outputs at the expense of contextual meaning. This can result in a diminished capacity to handle nuanced aspects of bias and toxicity, such as underrepresentation or negative portrayals across various demographics. To address these challenges, we introduce MBIAS, an LLM framework carefully instruction fine-tuned on a custom dataset designed specifically for safety interventions. MBIAS is designed to significantly reduce biases and toxic elements in LLM outputs while preserving the main information. This work also details our further use of LLMs: as annotator under human supervision and as evaluator of generated content. Empirical analysis reveals that MBIAS achieves a reduction in bias and toxicity by over 30% in standard evaluations, and by more than 90% in diverse demographic tests, highlighting the robustness of our approach. We make the dataset and the fine-tuned MBIAS model available to the research community for further investigation and to ensure reproducibility. The code for this project can be accessed here https://github.com/shainarazavi/MBIAS.</abstract>
<identifier type="citekey">raza-etal-2024-mbias</identifier>
<location>
<url>https://aclanthology.org/2024.wassa-1.9</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>97</start>
<end>111</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T MBIAS: Mitigating Bias in Large Language Models While Retaining Context
%A Raza, Shaina
%A Raval, Ananya
%A Chatrath, Veronica
%Y De Clercq, Orphée
%Y Barriere, Valentin
%Y Barnes, Jeremy
%Y Klinger, Roman
%Y Sedoc, João
%Y Tafreshi, Shabnam
%S Proceedings of the 14th Workshop on Computational Approaches to Subjectivity, Sentiment, & Social Media Analysis
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F raza-etal-2024-mbias
%X The deployment of Large Language Models (LLMs) in diverse applications necessitates an assurance of safety without compromising the contextual integrity of the generated content. Traditional approaches, including safety-specific fine-tuning or adversarial testing, often yield safe outputs at the expense of contextual meaning. This can result in a diminished capacity to handle nuanced aspects of bias and toxicity, such as underrepresentation or negative portrayals across various demographics. To address these challenges, we introduce MBIAS, an LLM framework carefully instruction fine-tuned on a custom dataset designed specifically for safety interventions. MBIAS is designed to significantly reduce biases and toxic elements in LLM outputs while preserving the main information. This work also details our further use of LLMs: as annotator under human supervision and as evaluator of generated content. Empirical analysis reveals that MBIAS achieves a reduction in bias and toxicity by over 30% in standard evaluations, and by more than 90% in diverse demographic tests, highlighting the robustness of our approach. We make the dataset and the fine-tuned MBIAS model available to the research community for further investigation and to ensure reproducibility. The code for this project can be accessed here https://github.com/shainarazavi/MBIAS.
%U https://aclanthology.org/2024.wassa-1.9
%P 97-111
Markdown (Informal)
[MBIAS: Mitigating Bias in Large Language Models While Retaining Context](https://aclanthology.org/2024.wassa-1.9) (Raza et al., WASSA 2024)
ACL
Shaina Raza, Ananya Raval, and Veronica Chatrath. 2024. MBIAS: Mitigating Bias in Large Language Models While Retaining Context. In Proceedings of the 14th Workshop on Computational Approaches to Subjectivity, Sentiment, & Social Media Analysis, pages 97–111, Bangkok, Thailand. Association for Computational Linguistics.