@inproceedings{wannasuphoprasit-etal-2023-solving,
title = "Solving Cosine Similarity Underestimation between High Frequency Words by $\ell_2$ Norm Discounting",
author = "Wannasuphoprasit, Saeth and
Zhou, Yi and
Bollegala, Danushka",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.550",
doi = "10.18653/v1/2023.findings-acl.550",
pages = "8644--8652",
abstract = "Cosine similarity between two words, computed using their contextualised token embeddings obtained from masked language models (MLMs) such as BERT has shown to underestimate the actual similarity between those words CITATION.This similarity underestimation problem is particularly severe for high frequent words. Although this problem has been noted in prior work, no solution has been proposed thus far. We observe that the $\ell_2$ norm of contextualised embeddings of a word correlates with its log-frequency in the pretraining corpus.Consequently, the larger $\ell_2$ norms associated with the high frequent words reduce the cosine similarity values measured between them, thus underestimating the similarity scores.To solve this issue, we propose a method to \textit{discount} the $\ell_2$ norm of a contextualised word embedding by the frequency of that word in a corpus when measuring the cosine similarities between words.We show that the so called \textit{stop} words behave differently from the rest of the words, which require special consideration during their discounting process.Experimental results on a contextualised word similarity dataset show that our proposed discounting method accurately solves the similarity underestimation problem.An anonymized version of the source code of our proposed method is submitted to the reviewing system.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wannasuphoprasit-etal-2023-solving">
<titleInfo>
<title>Solving Cosine Similarity Underestimation between High Frequency Words by ℓ₂ Norm Discounting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saeth</namePart>
<namePart type="family">Wannasuphoprasit</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danushka</namePart>
<namePart type="family">Bollegala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Cosine similarity between two words, computed using their contextualised token embeddings obtained from masked language models (MLMs) such as BERT, has been shown to underestimate the actual similarity between those words CITATION. This similarity underestimation problem is particularly severe for high-frequency words. Although this problem has been noted in prior work, no solution has been proposed thus far. We observe that the ℓ₂ norm of the contextualised embeddings of a word correlates with its log-frequency in the pretraining corpus. Consequently, the larger ℓ₂ norms associated with high-frequency words reduce the cosine similarity values measured between them, thus underestimating the similarity scores. To solve this issue, we propose a method to discount the ℓ₂ norm of a contextualised word embedding by the frequency of that word in a corpus when measuring the cosine similarities between words. We show that the so-called stop words behave differently from the rest of the words and require special consideration during their discounting process. Experimental results on a contextualised word similarity dataset show that our proposed discounting method accurately solves the similarity underestimation problem. An anonymized version of the source code of our proposed method is submitted to the reviewing system.</abstract>
<identifier type="citekey">wannasuphoprasit-etal-2023-solving</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.550</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.550</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>8644</start>
<end>8652</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Solving Cosine Similarity Underestimation between High Frequency Words by ℓ₂ Norm Discounting
%A Wannasuphoprasit, Saeth
%A Zhou, Yi
%A Bollegala, Danushka
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F wannasuphoprasit-etal-2023-solving
%X Cosine similarity between two words, computed using their contextualised token embeddings obtained from masked language models (MLMs) such as BERT, has been shown to underestimate the actual similarity between those words CITATION. This similarity underestimation problem is particularly severe for high-frequency words. Although this problem has been noted in prior work, no solution has been proposed thus far. We observe that the ℓ₂ norm of the contextualised embeddings of a word correlates with its log-frequency in the pretraining corpus. Consequently, the larger ℓ₂ norms associated with high-frequency words reduce the cosine similarity values measured between them, thus underestimating the similarity scores. To solve this issue, we propose a method to discount the ℓ₂ norm of a contextualised word embedding by the frequency of that word in a corpus when measuring the cosine similarities between words. We show that the so-called stop words behave differently from the rest of the words and require special consideration during their discounting process. Experimental results on a contextualised word similarity dataset show that our proposed discounting method accurately solves the similarity underestimation problem. An anonymized version of the source code of our proposed method is submitted to the reviewing system.
%R 10.18653/v1/2023.findings-acl.550
%U https://aclanthology.org/2023.findings-acl.550
%U https://doi.org/10.18653/v1/2023.findings-acl.550
%P 8644-8652
Markdown (Informal)
[Solving Cosine Similarity Underestimation between High Frequency Words by ℓ2 Norm Discounting](https://aclanthology.org/2023.findings-acl.550) (Wannasuphoprasit et al., Findings 2023)
ACL
Saeth Wannasuphoprasit, Yi Zhou, and Danushka Bollegala. 2023. Solving Cosine Similarity Underestimation between High Frequency Words by ℓ2 Norm Discounting. In Findings of the Association for Computational Linguistics: ACL 2023, pages 8644–8652, Toronto, Canada. Association for Computational Linguistics.
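
The abstract describes the method only at a high level: shrink the effective ℓ2 norm of a word's contextualised embedding according to its corpus frequency before taking the cosine. The sketch below is one illustrative reading of that idea, not the authors' published implementation; the logarithmic discount function, the coefficient `alpha`, and the toy frequencies are all assumptions made here for demonstration (the paper fits the norm-versus-log-frequency relationship empirically and treats stop words separately).

```python
import numpy as np

def norm_discount(freq: int, alpha: float = 0.05) -> float:
    """Hypothetical discount factor that grows with log corpus frequency,
    so a more frequent word has its effective norm shrunk more. The log1p
    form and alpha are illustrative assumptions, not fitted values."""
    return 1.0 + alpha * np.log1p(freq)

def discounted_cosine(u: np.ndarray, v: np.ndarray,
                      freq_u: int, freq_v: int) -> float:
    """Cosine similarity with frequency-discounted l2 norms in the
    denominator. Dividing by smaller (discounted) norms raises the scores
    of high-frequency word pairs, counteracting the underestimation
    described in the abstract."""
    nu = np.linalg.norm(u) / norm_discount(freq_u)
    nv = np.linalg.norm(v) / norm_discount(freq_v)
    return float(u @ v) / (nu * nv)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    u = rng.normal(size=768)                 # stand-in BERT-sized embeddings
    v = u + 0.5 * rng.normal(size=768)
    plain = float(u @ v) / (np.linalg.norm(u) * np.linalg.norm(v))
    # A frequent pair gets a boost relative to plain cosine; note that the
    # discounted score is no longer bounded above by 1.
    print(plain, discounted_cosine(u, v, freq_u=10**6, freq_v=10**6))
```

One point worth noting about the design: cosine similarity is invariant to rescaling either input vector, so the discount only has an effect because it is applied to the norms in the denominator while the raw dot product is kept; simply rescaling the embeddings themselves would leave the cosine unchanged.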