@inproceedings{prakash-lee-2023-layered,
title = "Layered Bias: Interpreting Bias in Pretrained Large Language Models",
author = "Prakash, Nirmalendu and
Lee, Roy Ka-Wei",
editor = "Belinkov, Yonatan and
Hao, Sophie and
Jumelet, Jaap and
Kim, Najoung and
McCarthy, Arya and
Mohebbi, Hosein",
booktitle = "Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.blackboxnlp-1.22",
doi = "10.18653/v1/2023.blackboxnlp-1.22",
pages = "284--295",
abstract = "Large language models (LLMs) like GPT and PALM have excelled in numerous natural language processing (NLP) tasks such as text generation, question answering, and translation. However, they are also found to have inherent social biases. To address this, recent studies have proposed debiasing techniques like iterative nullspace projection (INLP) and Counterfactual Data Augmentation (CDA). Additionally, there{'}s growing interest in understanding the intricacies of these models. Some researchers focus on individual neural units, while others examine specific layers. In our study, we benchmark newly released models, assess the impact of debiasing methods, and investigate how biases are linked to different transformer layers using a method called Logit Lens. Specifically, we evaluate three modern LLMs: OPT, LLaMA, and LLaMA2, and their debiased versions. Our experiments are based on two popular bias evaluation datasets, StereoSet and CrowS-Pairs, and we perform a layer-by-layer analysis using the Logit Lens.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="prakash-lee-2023-layered">
<titleInfo>
<title>Layered Bias: Interpreting Bias in Pretrained Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nirmalendu</namePart>
<namePart type="family">Prakash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roy</namePart>
<namePart type="given">Ka-Wei</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophie</namePart>
<namePart type="family">Hao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaap</namePart>
<namePart type="family">Jumelet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Najoung</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arya</namePart>
<namePart type="family">McCarthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hosein</namePart>
<namePart type="family">Mohebbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) like GPT and PALM have excelled in numerous natural language processing (NLP) tasks such as text generation, question answering, and translation. However, they are also found to have inherent social biases. To address this, recent studies have proposed debiasing techniques like iterative nullspace projection (INLP) and Counterfactual Data Augmentation (CDA). Additionally, there’s growing interest in understanding the intricacies of these models. Some researchers focus on individual neural units, while others examine specific layers. In our study, we benchmark newly released models, assess the impact of debiasing methods, and investigate how biases are linked to different transformer layers using a method called Logit Lens. Specifically, we evaluate three modern LLMs: OPT, LLaMA, and LLaMA2, and their debiased versions. Our experiments are based on two popular bias evaluation datasets, StereoSet and CrowS-Pairs, and we perform a layer-by-layer analysis using the Logit Lens.</abstract>
<identifier type="citekey">prakash-lee-2023-layered</identifier>
<identifier type="doi">10.18653/v1/2023.blackboxnlp-1.22</identifier>
<location>
<url>https://aclanthology.org/2023.blackboxnlp-1.22</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>284</start>
<end>295</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Layered Bias: Interpreting Bias in Pretrained Large Language Models
%A Prakash, Nirmalendu
%A Lee, Roy Ka-Wei
%Y Belinkov, Yonatan
%Y Hao, Sophie
%Y Jumelet, Jaap
%Y Kim, Najoung
%Y McCarthy, Arya
%Y Mohebbi, Hosein
%S Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F prakash-lee-2023-layered
%X Large language models (LLMs) like GPT and PALM have excelled in numerous natural language processing (NLP) tasks such as text generation, question answering, and translation. However, they are also found to have inherent social biases. To address this, recent studies have proposed debiasing techniques like iterative nullspace projection (INLP) and Counterfactual Data Augmentation (CDA). Additionally, there’s growing interest in understanding the intricacies of these models. Some researchers focus on individual neural units, while others examine specific layers. In our study, we benchmark newly released models, assess the impact of debiasing methods, and investigate how biases are linked to different transformer layers using a method called Logit Lens. Specifically, we evaluate three modern LLMs: OPT, LLaMA, and LLaMA2, and their debiased versions. Our experiments are based on two popular bias evaluation datasets, StereoSet and CrowS-Pairs, and we perform a layer-by-layer analysis using the Logit Lens.
%R 10.18653/v1/2023.blackboxnlp-1.22
%U https://aclanthology.org/2023.blackboxnlp-1.22
%U https://doi.org/10.18653/v1/2023.blackboxnlp-1.22
%P 284-295
Markdown (Informal)
[Layered Bias: Interpreting Bias in Pretrained Large Language Models](https://aclanthology.org/2023.blackboxnlp-1.22) (Prakash & Lee, BlackboxNLP-WS 2023)
ACL