@inproceedings{sesari-etal-2022-empirical,
title = "An Empirical Study on the Fairness of Pre-trained Word Embeddings",
author = "Sesari, Emeralda and
Hort, Max and
Sarro, Federica",
editor = "Hardmeier, Christian and
Basta, Christine and
Costa-juss{\`a}, Marta R. and
Stanovsky, Gabriel and
Gonen, Hila",
booktitle = "Proceedings of the 4th Workshop on Gender Bias in Natural Language Processing (GeBNLP)",
month = jul,
year = "2022",
address = "Seattle, Washington",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.gebnlp-1.15/",
doi = "10.18653/v1/2022.gebnlp-1.15",
pages = "129--144",
abstract = "Pre-trained word embedding models are easily distributed and applied, as they alleviate users from the effort to train models themselves. With widely distributed models, it is important to ensure that they do not exhibit undesired behaviour, such as biases against population groups. For this purpose, we carry out an empirical study on evaluating the bias of 15 publicly available, pre-trained word embeddings model based on three training algorithms (GloVe, word2vec, and fastText) with regard to four bias metrics (WEAT, SEMBIAS,DIRECT BIAS, and ECT). The choice of word embedding models and bias metrics is motivated by a literature survey over 37 publications which quantified bias on pre-trained word embeddings. Our results indicate that fastText is the least biased model (in 8 out of 12 cases) and small vector lengths lead to a higher bias."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sesari-etal-2022-empirical">
<titleInfo>
<title>An Empirical Study on the Fairness of Pre-trained Word Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Emeralda</namePart>
<namePart type="family">Sesari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Hort</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Federica</namePart>
<namePart type="family">Sarro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Gender Bias in Natural Language Processing (GeBNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Hardmeier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christine</namePart>
<namePart type="family">Basta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hila</namePart>
<namePart type="family">Gonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, Washington</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
  <abstract>Pre-trained word embedding models are easily distributed and applied, as they spare users the effort of training models themselves. With widely distributed models, it is important to ensure that they do not exhibit undesired behaviour, such as biases against population groups. For this purpose, we carry out an empirical study evaluating the bias of 15 publicly available, pre-trained word embedding models based on three training algorithms (GloVe, word2vec, and fastText) with regard to four bias metrics (WEAT, SEMBIAS, DIRECT BIAS, and ECT). The choice of word embedding models and bias metrics is motivated by a literature survey of 37 publications that quantified bias in pre-trained word embeddings. Our results indicate that fastText is the least biased model (in 8 out of 12 cases) and that smaller vector lengths lead to higher bias.</abstract>
<identifier type="citekey">sesari-etal-2022-empirical</identifier>
<identifier type="doi">10.18653/v1/2022.gebnlp-1.15</identifier>
<location>
<url>https://aclanthology.org/2022.gebnlp-1.15/</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>129</start>
<end>144</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Empirical Study on the Fairness of Pre-trained Word Embeddings
%A Sesari, Emeralda
%A Hort, Max
%A Sarro, Federica
%Y Hardmeier, Christian
%Y Basta, Christine
%Y Costa-jussà, Marta R.
%Y Stanovsky, Gabriel
%Y Gonen, Hila
%S Proceedings of the 4th Workshop on Gender Bias in Natural Language Processing (GeBNLP)
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, Washington
%F sesari-etal-2022-empirical
%X Pre-trained word embedding models are easily distributed and applied, as they spare users the effort of training models themselves. With widely distributed models, it is important to ensure that they do not exhibit undesired behaviour, such as biases against population groups. For this purpose, we carry out an empirical study evaluating the bias of 15 publicly available, pre-trained word embedding models based on three training algorithms (GloVe, word2vec, and fastText) with regard to four bias metrics (WEAT, SEMBIAS, DIRECT BIAS, and ECT). The choice of word embedding models and bias metrics is motivated by a literature survey of 37 publications that quantified bias in pre-trained word embeddings. Our results indicate that fastText is the least biased model (in 8 out of 12 cases) and that smaller vector lengths lead to higher bias.
%R 10.18653/v1/2022.gebnlp-1.15
%U https://aclanthology.org/2022.gebnlp-1.15/
%U https://doi.org/10.18653/v1/2022.gebnlp-1.15
%P 129-144
Markdown (Informal)
[An Empirical Study on the Fairness of Pre-trained Word Embeddings](https://aclanthology.org/2022.gebnlp-1.15/) (Sesari et al., GeBNLP 2022)
ACL
Emeralda Sesari, Max Hort, and Federica Sarro. 2022. An Empirical Study on the Fairness of Pre-trained Word Embeddings. In Proceedings of the 4th Workshop on Gender Bias in Natural Language Processing (GeBNLP), pages 129–144, Seattle, Washington. Association for Computational Linguistics.
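
For readers who want a concrete sense of the bias metrics the abstract lists, below is a minimal, hypothetical Python sketch of the WEAT effect size (Caliskan et al., 2017), the first of the four metrics. The random toy vectors stand in for rows of a real GloVe, word2vec, or fastText embedding matrix, the word-set labels in the comments are illustrative assumptions, and none of this is the paper's own implementation.

```python
# Minimal sketch of the WEAT effect size, assuming only numpy.
# Toy random vectors replace real pre-trained embeddings; word-set
# labels (career/family, male/female) are hypothetical examples.
import numpy as np

def cosine(u, v):
    # Cosine similarity between two 1-D embedding vectors.
    return u @ v / (np.linalg.norm(u) * np.linalg.norm(v))

def association(w, A, B):
    # s(w, A, B): mean similarity of w to attribute set A minus to set B.
    return np.mean([cosine(w, a) for a in A]) - np.mean([cosine(w, b) for b in B])

def weat_effect_size(X, Y, A, B):
    # d = (mean_x s(x,A,B) - mean_y s(y,A,B)) / std of s over X ∪ Y
    # (sample std here; implementations vary on ddof).
    s_X = [association(x, A, B) for x in X]
    s_Y = [association(y, A, B) for y in Y]
    return (np.mean(s_X) - np.mean(s_Y)) / np.std(s_X + s_Y, ddof=1)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    dim = 50  # the paper finds smaller vector lengths tend to show higher bias
    X = rng.normal(size=(8, dim))  # e.g. career-related target words
    Y = rng.normal(size=(8, dim))  # e.g. family-related target words
    A = rng.normal(size=(8, dim))  # e.g. male attribute words
    B = rng.normal(size=(8, dim))  # e.g. female attribute words
    print(f"WEAT effect size d = {weat_effect_size(X, Y, A, B):.3f}")
```

With real embeddings, |d| closer to 0 indicates less association between the target and attribute sets; random vectors, as here, should yield a value near zero.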