@inproceedings{setzu-etal-2024-fairbelief,
title = "{F}air{B}elief - Assessing Harmful Beliefs in Language Models",
author = "Setzu, Mattia and
Marchiori Manerba, Marta and
Minervini, Pasquale and
Nozza, Debora",
editor = "Ovalle, Anaelia and
Chang, Kai-Wei and
Cao, Yang Trista and
Mehrabi, Ninareh and
Zhao, Jieyu and
Galstyan, Aram and
Dhamala, Jwala and
Kumar, Anoop and
Gupta, Rahul",
booktitle = "Proceedings of the 4th Workshop on Trustworthy Natural Language Processing (TrustNLP 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.trustnlp-1.3",
doi = "10.18653/v1/2024.trustnlp-1.3",
pages = "27--39",
abstract = "Language Models (LMs) have been shown to inherit undesired biases that might hurt minorities and underrepresented groups if such systems were integrated into real-world applications without careful fairness auditing.This paper proposes FairBelief, an analytical approach to capture and assess beliefs, i.e., propositions that an LM may embed with different degrees of confidence and that covertly influence its predictions. With FairBelief, we leverage prompting to study the behavior of several state-of-the-art LMs across different previously neglected axes, such as model scale and likelihood, assessing predictions on a fairness dataset specifically designed to quantify LMs{'} outputs{'} hurtfulness.Finally, we conclude with an in-depth qualitative assessment of the beliefs emitted by the models.We apply FairBelief to English LMs, revealing that, although these architectures enable high performances on diverse natural language processing tasks, they show hurtful beliefs about specific genders. Interestingly, training procedure and dataset, model scale, and architecture induce beliefs of different degrees of hurtfulness.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="setzu-etal-2024-fairbelief">
<titleInfo>
<title>FairBelief - Assessing Harmful Beliefs in Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mattia</namePart>
<namePart type="family">Setzu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="family">Marchiori Manerba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pasquale</namePart>
<namePart type="family">Minervini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debora</namePart>
<namePart type="family">Nozza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Trustworthy Natural Language Processing (TrustNLP 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anaelia</namePart>
<namePart type="family">Ovalle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="given">Trista</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jieyu</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aram</namePart>
<namePart type="family">Galstyan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language Models (LMs) have been shown to inherit undesired biases that might hurt minorities and underrepresented groups if such systems were integrated into real-world applications without careful fairness auditing.This paper proposes FairBelief, an analytical approach to capture and assess beliefs, i.e., propositions that an LM may embed with different degrees of confidence and that covertly influence its predictions. With FairBelief, we leverage prompting to study the behavior of several state-of-the-art LMs across different previously neglected axes, such as model scale and likelihood, assessing predictions on a fairness dataset specifically designed to quantify LMs’ outputs’ hurtfulness.Finally, we conclude with an in-depth qualitative assessment of the beliefs emitted by the models.We apply FairBelief to English LMs, revealing that, although these architectures enable high performances on diverse natural language processing tasks, they show hurtful beliefs about specific genders. Interestingly, training procedure and dataset, model scale, and architecture induce beliefs of different degrees of hurtfulness.</abstract>
<identifier type="citekey">setzu-etal-2024-fairbelief</identifier>
<identifier type="doi">10.18653/v1/2024.trustnlp-1.3</identifier>
<location>
<url>https://aclanthology.org/2024.trustnlp-1.3</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>27</start>
<end>39</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FairBelief - Assessing Harmful Beliefs in Language Models
%A Setzu, Mattia
%A Marchiori Manerba, Marta
%A Minervini, Pasquale
%A Nozza, Debora
%Y Ovalle, Anaelia
%Y Chang, Kai-Wei
%Y Cao, Yang Trista
%Y Mehrabi, Ninareh
%Y Zhao, Jieyu
%Y Galstyan, Aram
%Y Dhamala, Jwala
%Y Kumar, Anoop
%Y Gupta, Rahul
%S Proceedings of the 4th Workshop on Trustworthy Natural Language Processing (TrustNLP 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F setzu-etal-2024-fairbelief
%X Language Models (LMs) have been shown to inherit undesired biases that might hurt minorities and underrepresented groups if such systems were integrated into real-world applications without careful fairness auditing.This paper proposes FairBelief, an analytical approach to capture and assess beliefs, i.e., propositions that an LM may embed with different degrees of confidence and that covertly influence its predictions. With FairBelief, we leverage prompting to study the behavior of several state-of-the-art LMs across different previously neglected axes, such as model scale and likelihood, assessing predictions on a fairness dataset specifically designed to quantify LMs’ outputs’ hurtfulness.Finally, we conclude with an in-depth qualitative assessment of the beliefs emitted by the models.We apply FairBelief to English LMs, revealing that, although these architectures enable high performances on diverse natural language processing tasks, they show hurtful beliefs about specific genders. Interestingly, training procedure and dataset, model scale, and architecture induce beliefs of different degrees of hurtfulness.
%R 10.18653/v1/2024.trustnlp-1.3
%U https://aclanthology.org/2024.trustnlp-1.3
%U https://doi.org/10.18653/v1/2024.trustnlp-1.3
%P 27-39
Markdown (Informal)
[FairBelief - Assessing Harmful Beliefs in Language Models](https://aclanthology.org/2024.trustnlp-1.3) (Setzu et al., TrustNLP-WS 2024)
ACL
- Mattia Setzu, Marta Marchiori Manerba, Pasquale Minervini, and Debora Nozza. 2024. FairBelief - Assessing Harmful Beliefs in Language Models. In Proceedings of the 4th Workshop on Trustworthy Natural Language Processing (TrustNLP 2024), pages 27–39, Mexico City, Mexico. Association for Computational Linguistics.