% BibTeX record for the TrustNLP 2022 workshop paper (ACL Anthology ID 2022.trustnlp-1.7).
@inproceedings{kwon-mihindukulasooriya-2022-empirical,
  title     = {An Empirical Study on Pseudo-log-likelihood Bias Measures for Masked Language Models Using Paraphrased Sentences},
  author    = {Kwon, Bum Chul and
               Mihindukulasooriya, Nandana},
  editor    = {Verma, Apurv and
               Pruksachatkun, Yada and
               Chang, Kai-Wei and
               Galstyan, Aram and
               Dhamala, Jwala and
               Cao, Yang Trista},
  booktitle = {Proceedings of the 2nd Workshop on Trustworthy Natural Language Processing ({TrustNLP} 2022)},
  month     = jul,
  year      = {2022},
  address   = {Seattle, U.S.A.},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.trustnlp-1.7},
  doi       = {10.18653/v1/2022.trustnlp-1.7},
  pages     = {74--79},
  abstract  = {In this paper, we conduct an empirical study on a bias measure, log-likelihood Masked Language Model (MLM) scoring, on a benchmark dataset. Previous work evaluates whether MLMs are biased or not for certain protected attributes (e.g., race) by comparing the log-likelihood scores of sentences that contain stereotypical characteristics with one category (e.g., black) versus another (e.g., white). We hypothesized that this approach might be too sensitive to the choice of contextual words than the meaning of the sentence. Therefore, we computed the same measure after paraphrasing the sentences with different words but with same meaning. Our results demonstrate that the log-likelihood scoring can be more sensitive to utterance of specific words than to meaning behind a given sentence. Our paper reveals a shortcoming of the current log-likelihood-based bias measures for MLMs and calls for new ways to improve the robustness of it.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey kwon-mihindukulasooriya-2022-empirical. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kwon-mihindukulasooriya-2022-empirical">
    <titleInfo>
      <title>An Empirical Study on Pseudo-log-likelihood Bias Measures for Masked Language Models Using Paraphrased Sentences</title>
    </titleInfo>
    <!-- Authors of the paper, in byline order. -->
    <name type="personal">
      <namePart type="given">Bum</namePart>
      <namePart type="given">Chul</namePart>
      <namePart type="family">Kwon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nandana</namePart>
      <namePart type="family">Mihindukulasooriya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd Workshop on Trustworthy Natural Language Processing (TrustNLP 2022)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Apurv</namePart>
        <namePart type="family">Verma</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yada</namePart>
        <namePart type="family">Pruksachatkun</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kai-Wei</namePart>
        <namePart type="family">Chang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Aram</namePart>
        <namePart type="family">Galstyan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jwala</namePart>
        <namePart type="family">Dhamala</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="given">Trista</namePart>
        <namePart type="family">Cao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Seattle, U.S.A.</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper, we conduct an empirical study on a bias measure, log-likelihood Masked Language Model (MLM) scoring, on a benchmark dataset. Previous work evaluates whether MLMs are biased or not for certain protected attributes (e.g., race) by comparing the log-likelihood scores of sentences that contain stereotypical characteristics with one category (e.g., black) versus another (e.g., white). We hypothesized that this approach might be too sensitive to the choice of contextual words than the meaning of the sentence. Therefore, we computed the same measure after paraphrasing the sentences with different words but with same meaning. Our results demonstrate that the log-likelihood scoring can be more sensitive to utterance of specific words than to meaning behind a given sentence. Our paper reveals a shortcoming of the current log-likelihood-based bias measures for MLMs and calls for new ways to improve the robustness of it.</abstract>
    <identifier type="citekey">kwon-mihindukulasooriya-2022-empirical</identifier>
    <identifier type="doi">10.18653/v1/2022.trustnlp-1.7</identifier>
    <location>
      <url>https://aclanthology.org/2022.trustnlp-1.7</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2022-07</date>
      <extent unit="page">
        <start>74</start>
        <end>79</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T An Empirical Study on Pseudo-log-likelihood Bias Measures for Masked Language Models Using Paraphrased Sentences
%A Kwon, Bum Chul
%A Mihindukulasooriya, Nandana
%Y Verma, Apurv
%Y Pruksachatkun, Yada
%Y Chang, Kai-Wei
%Y Galstyan, Aram
%Y Dhamala, Jwala
%Y Cao, Yang Trista
%S Proceedings of the 2nd Workshop on Trustworthy Natural Language Processing (TrustNLP 2022)
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, U.S.A.
%F kwon-mihindukulasooriya-2022-empirical
%X In this paper, we conduct an empirical study on a bias measure, log-likelihood Masked Language Model (MLM) scoring, on a benchmark dataset. Previous work evaluates whether MLMs are biased or not for certain protected attributes (e.g., race) by comparing the log-likelihood scores of sentences that contain stereotypical characteristics with one category (e.g., black) versus another (e.g., white). We hypothesized that this approach might be too sensitive to the choice of contextual words than the meaning of the sentence. Therefore, we computed the same measure after paraphrasing the sentences with different words but with same meaning. Our results demonstrate that the log-likelihood scoring can be more sensitive to utterance of specific words than to meaning behind a given sentence. Our paper reveals a shortcoming of the current log-likelihood-based bias measures for MLMs and calls for new ways to improve the robustness of it.
%R 10.18653/v1/2022.trustnlp-1.7
%U https://aclanthology.org/2022.trustnlp-1.7
%U https://doi.org/10.18653/v1/2022.trustnlp-1.7
%P 74-79
Markdown (Informal)
[An Empirical Study on Pseudo-log-likelihood Bias Measures for Masked Language Models Using Paraphrased Sentences](https://aclanthology.org/2022.trustnlp-1.7) (Kwon & Mihindukulasooriya, TrustNLP 2022)
ACL