BibTeX
@article{schick-etal-2021-self,
title = "Self-Diagnosis and Self-Debiasing: A Proposal for Reducing Corpus-Based Bias in {NLP}",
author = {Schick, Timo and
Udupa, Sahana and
Sch{\"u}tze, Hinrich},
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "9",
year = "2021",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2021.tacl-1.84",
doi = "10.1162/tacl_a_00434",
pages = "1408--1424",
abstract = "This paper contains prompts and model outputs that are offensive in nature. When trained on large, unfiltered crawls from the Internet, language models pick up and reproduce all kinds of undesirable biases that can be found in the data: They often generate racist, sexist, violent, or otherwise toxic language. As large models require millions of training examples to achieve good performance, it is difficult to completely prevent them from being exposed to such content. In this paper, we first demonstrate a surprising finding: Pretrained language models recognize, to a considerable degree, their undesirable biases and the toxicity of the content they produce. We refer to this capability as self-diagnosis. Based on this finding, we then propose a decoding algorithm that, given only a textual description of the undesired behavior, reduces the probability of a language model producing problematic text. We refer to this approach as self-debiasing. Self-debiasing does not rely on manually curated word lists, nor does it require any training data or changes to the model{'}s parameters. While we by no means eliminate the issue of language models generating biased text, we believe our approach to be an important step in this direction.1",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schick-etal-2021-self">
<titleInfo>
<title>Self-Diagnosis and Self-Debiasing: A Proposal for Reducing Corpus-Based Bias in NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Timo</namePart>
<namePart type="family">Schick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sahana</namePart>
<namePart type="family">Udupa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hinrich</namePart>
<namePart type="family">Schütze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>This paper contains prompts and model outputs that are offensive in nature. When trained on large, unfiltered crawls from the Internet, language models pick up and reproduce all kinds of undesirable biases that can be found in the data: They often generate racist, sexist, violent, or otherwise toxic language. As large models require millions of training examples to achieve good performance, it is difficult to completely prevent them from being exposed to such content. In this paper, we first demonstrate a surprising finding: Pretrained language models recognize, to a considerable degree, their undesirable biases and the toxicity of the content they produce. We refer to this capability as self-diagnosis. Based on this finding, we then propose a decoding algorithm that, given only a textual description of the undesired behavior, reduces the probability of a language model producing problematic text. We refer to this approach as self-debiasing. Self-debiasing does not rely on manually curated word lists, nor does it require any training data or changes to the model’s parameters. While we by no means eliminate the issue of language models generating biased text, we believe our approach to be an important step in this direction.</abstract>
<identifier type="citekey">schick-etal-2021-self</identifier>
<identifier type="doi">10.1162/tacl_a_00434</identifier>
<location>
<url>https://aclanthology.org/2021.tacl-1.84</url>
</location>
<part>
<date>2021</date>
<detail type="volume"><number>9</number></detail>
<extent unit="page">
<start>1408</start>
<end>1424</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Journal Article
%T Self-Diagnosis and Self-Debiasing: A Proposal for Reducing Corpus-Based Bias in NLP
%A Schick, Timo
%A Udupa, Sahana
%A Schütze, Hinrich
%J Transactions of the Association for Computational Linguistics
%D 2021
%V 9
%I MIT Press
%C Cambridge, MA
%F schick-etal-2021-self
%X This paper contains prompts and model outputs that are offensive in nature. When trained on large, unfiltered crawls from the Internet, language models pick up and reproduce all kinds of undesirable biases that can be found in the data: They often generate racist, sexist, violent, or otherwise toxic language. As large models require millions of training examples to achieve good performance, it is difficult to completely prevent them from being exposed to such content. In this paper, we first demonstrate a surprising finding: Pretrained language models recognize, to a considerable degree, their undesirable biases and the toxicity of the content they produce. We refer to this capability as self-diagnosis. Based on this finding, we then propose a decoding algorithm that, given only a textual description of the undesired behavior, reduces the probability of a language model producing problematic text. We refer to this approach as self-debiasing. Self-debiasing does not rely on manually curated word lists, nor does it require any training data or changes to the model’s parameters. While we by no means eliminate the issue of language models generating biased text, we believe our approach to be an important step in this direction.
%R 10.1162/tacl_a_00434
%U https://aclanthology.org/2021.tacl-1.84
%U https://doi.org/10.1162/tacl_a_00434
%P 1408-1424
Markdown (Informal)
[Self-Diagnosis and Self-Debiasing: A Proposal for Reducing Corpus-Based Bias in NLP](https://aclanthology.org/2021.tacl-1.84) (Schick et al., TACL 2021)
ACL
Timo Schick, Sahana Udupa, and Hinrich Schütze. 2021. Self-Diagnosis and Self-Debiasing: A Proposal for Reducing Corpus-Based Bias in NLP. Transactions of the Association for Computational Linguistics, 9:1408–1424.
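Note: the abstract describes self-debiasing as a decoding-time adjustment that compares the model's next-token probabilities with and without a textual description of the undesired behavior and lowers the probability of tokens that become more likely when that description is present. The following is a minimal, hypothetical Python sketch of that idea only; the prefix wording, the use of GPT-2, the decay parameter, and the exponential rescaling rule are illustrative assumptions, not the paper's exact formulation (see the article at the DOI above for the actual algorithm).

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

# Hypothetical textual description of the undesired behavior.
DEBIAS_PREFIX = ("The following text contains rude, disrespectful, "
                 "or unreasonable language:\n")

def next_token_probs(text: str) -> torch.Tensor:
    # Probability distribution over the next token given `text`.
    ids = tokenizer(text, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(ids).logits[0, -1]
    return torch.softmax(logits, dim=-1)

def self_debiased_probs(prompt: str, decay: float = 50.0) -> torch.Tensor:
    # Compare the distribution given the prompt alone with the distribution
    # given the prompt prefixed by the description of the undesired behavior.
    p_plain = next_token_probs(prompt)
    p_flagged = next_token_probs(DEBIAS_PREFIX + prompt)
    # Tokens that become more likely once the undesired behavior is described
    # are treated as suspect and exponentially down-weighted (illustrative rule).
    delta = p_plain - p_flagged
    scale = torch.where(delta < 0, torch.exp(decay * delta), torch.ones_like(delta))
    probs = p_plain * scale
    return probs / probs.sum()

# Example: pick the next token greedily from the adjusted distribution.
probs = self_debiased_probs("She got angry and started to")
print(tokenizer.decode(probs.argmax().item()))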