@inproceedings{valentini-etal-2023-investigating,
title = "Investigating the Frequency Distortion of Word Embeddings and Its Impact on Bias Metrics",
author = "Valentini, Francisco and
Sosa, Juan and
Slezak, Diego and
Altszyler, Edgar",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.9",
doi = "10.18653/v1/2023.findings-emnlp.9",
pages = "113--126",
abstract = "Recent research has shown that static word embeddings can encode words{'} frequencies. However, little has been studied about this behavior. In the present work, we study how frequency and semantic similarity relate to one another in static word embeddings, and we assess the impact of this relationship on embedding-based bias metrics. We find that Skip-gram, GloVe and FastText embeddings tend to produce higher similarity between high-frequency words than between other frequency combinations. We show that the association between frequency and similarity also appears when words are randomly shuffled, and holds for different hyperparameter settings. This proves that the patterns we find are neither due to real semantic associations nor to specific parameters choices, and are an artifact produced by the word embeddings. To illustrate how frequencies can affect the measurement of biases related to gender, ethnicity, and affluence, we carry out a controlled experiment that shows that biases can even change sign or reverse their order when word frequencies change.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="valentini-etal-2023-investigating">
<titleInfo>
<title>Investigating the Frequency Distortion of Word Embeddings and Its Impact on Bias Metrics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Francisco</namePart>
<namePart type="family">Valentini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Sosa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diego</namePart>
<namePart type="family">Slezak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edgar</namePart>
<namePart type="family">Altszyler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent research has shown that static word embeddings can encode words’ frequencies. However, little has been studied about this behavior. In the present work, we study how frequency and semantic similarity relate to one another in static word embeddings, and we assess the impact of this relationship on embedding-based bias metrics. We find that Skip-gram, GloVe and FastText embeddings tend to produce higher similarity between high-frequency words than between other frequency combinations. We show that the association between frequency and similarity also appears when words are randomly shuffled, and holds for different hyperparameter settings. This proves that the patterns we find are neither due to real semantic associations nor to specific parameters choices, and are an artifact produced by the word embeddings. To illustrate how frequencies can affect the measurement of biases related to gender, ethnicity, and affluence, we carry out a controlled experiment that shows that biases can even change sign or reverse their order when word frequencies change.</abstract>
<identifier type="citekey">valentini-etal-2023-investigating</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.9</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.9</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>113</start>
<end>126</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Investigating the Frequency Distortion of Word Embeddings and Its Impact on Bias Metrics
%A Valentini, Francisco
%A Sosa, Juan
%A Slezak, Diego
%A Altszyler, Edgar
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F valentini-etal-2023-investigating
%X Recent research has shown that static word embeddings can encode words’ frequencies. However, little has been studied about this behavior. In the present work, we study how frequency and semantic similarity relate to one another in static word embeddings, and we assess the impact of this relationship on embedding-based bias metrics. We find that Skip-gram, GloVe and FastText embeddings tend to produce higher similarity between high-frequency words than between other frequency combinations. We show that the association between frequency and similarity also appears when words are randomly shuffled, and holds for different hyperparameter settings. This proves that the patterns we find are neither due to real semantic associations nor to specific parameters choices, and are an artifact produced by the word embeddings. To illustrate how frequencies can affect the measurement of biases related to gender, ethnicity, and affluence, we carry out a controlled experiment that shows that biases can even change sign or reverse their order when word frequencies change.
%R 10.18653/v1/2023.findings-emnlp.9
%U https://aclanthology.org/2023.findings-emnlp.9
%U https://doi.org/10.18653/v1/2023.findings-emnlp.9
%P 113-126
Markdown (Informal)
[Investigating the Frequency Distortion of Word Embeddings and Its Impact on Bias Metrics](https://aclanthology.org/2023.findings-emnlp.9) (Valentini et al., Findings 2023)
ACL