@inproceedings{azarpanah-farhadloo-2021-measuring,
  author    = {Azarpanah, Hossein and
               Farhadloo, Mohsen},
  title     = {Measuring Biases of Word Embeddings: What Similarity Measures and Descriptive Statistics to Use?},
  editor    = {Pruksachatkun, Yada and
               Ramakrishna, Anil and
               Chang, Kai-Wei and
               Krishna, Satyapriya and
               Dhamala, Jwala and
               Guha, Tanaya and
               Ren, Xiang},
  booktitle = {Proceedings of the First Workshop on Trustworthy Natural Language Processing},
  month     = jun,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {8--14},
  doi       = {10.18653/v1/2021.trustnlp-1.2},
  url       = {https://aclanthology.org/2021.trustnlp-1.2},
  abstract  = {Word embeddings are widely used in Natural Language Processing (NLP) for a vast range of applications. However, it has been consistently proven that these embeddings reflect the same human biases that exist in the data used to train them. Most of the introduced bias indicators to reveal word embeddings{'} bias are average-based indicators based on the cosine similarity measure. In this study, we examine the impacts of different similarity measures as well as other descriptive techniques than averaging in measuring the biases of contextual and non-contextual word embeddings. We show that the extent of revealed biases in word embeddings depends on the descriptive statistics and similarity measures used to measure the bias. We found that over the ten categories of word embedding association tests, Mahalanobis distance reveals the smallest bias, and Euclidean distance reveals the largest bias in word embeddings. In addition, the contextual models reveal less severe biases than the non-contextual word embedding models.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="azarpanah-farhadloo-2021-measuring">
<titleInfo>
<title>Measuring Biases of Word Embeddings: What Similarity Measures and Descriptive Statistics to Use?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hossein</namePart>
<namePart type="family">Azarpanah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohsen</namePart>
<namePart type="family">Farhadloo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Trustworthy Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yada</namePart>
<namePart type="family">Pruksachatkun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anil</namePart>
<namePart type="family">Ramakrishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Satyapriya</namePart>
<namePart type="family">Krishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanaya</namePart>
<namePart type="family">Guha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word embeddings are widely used in Natural Language Processing (NLP) for a vast range of applications. However, it has been consistently proven that these embeddings reflect the same human biases that exist in the data used to train them. Most of the introduced bias indicators to reveal word embeddings’ bias are average-based indicators based on the cosine similarity measure. In this study, we examine the impacts of different similarity measures as well as other descriptive techniques than averaging in measuring the biases of contextual and non-contextual word embeddings. We show that the extent of revealed biases in word embeddings depends on the descriptive statistics and similarity measures used to measure the bias. We found that over the ten categories of word embedding association tests, Mahalanobis distance reveals the smallest bias, and Euclidean distance reveals the largest bias in word embeddings. In addition, the contextual models reveal less severe biases than the non-contextual word embedding models.</abstract>
<identifier type="citekey">azarpanah-farhadloo-2021-measuring</identifier>
<identifier type="doi">10.18653/v1/2021.trustnlp-1.2</identifier>
<location>
<url>https://aclanthology.org/2021.trustnlp-1.2</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>8</start>
<end>14</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Measuring Biases of Word Embeddings: What Similarity Measures and Descriptive Statistics to Use?
%A Azarpanah, Hossein
%A Farhadloo, Mohsen
%Y Pruksachatkun, Yada
%Y Ramakrishna, Anil
%Y Chang, Kai-Wei
%Y Krishna, Satyapriya
%Y Dhamala, Jwala
%Y Guha, Tanaya
%Y Ren, Xiang
%S Proceedings of the First Workshop on Trustworthy Natural Language Processing
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F azarpanah-farhadloo-2021-measuring
%X Word embeddings are widely used in Natural Language Processing (NLP) for a vast range of applications. However, it has been consistently proven that these embeddings reflect the same human biases that exist in the data used to train them. Most of the introduced bias indicators to reveal word embeddings’ bias are average-based indicators based on the cosine similarity measure. In this study, we examine the impacts of different similarity measures as well as other descriptive techniques than averaging in measuring the biases of contextual and non-contextual word embeddings. We show that the extent of revealed biases in word embeddings depends on the descriptive statistics and similarity measures used to measure the bias. We found that over the ten categories of word embedding association tests, Mahalanobis distance reveals the smallest bias, and Euclidean distance reveals the largest bias in word embeddings. In addition, the contextual models reveal less severe biases than the non-contextual word embedding models.
%R 10.18653/v1/2021.trustnlp-1.2
%U https://aclanthology.org/2021.trustnlp-1.2
%U https://doi.org/10.18653/v1/2021.trustnlp-1.2
%P 8-14
Markdown (Informal)
[Measuring Biases of Word Embeddings: What Similarity Measures and Descriptive Statistics to Use?](https://aclanthology.org/2021.trustnlp-1.2) (Azarpanah & Farhadloo, TrustNLP 2021)
ACL