@inproceedings{patel-bhattacharyya-2017-towards,
title = "Towards Lower Bounds on Number of Dimensions for Word Embeddings",
author = "Patel, Kevin and
Bhattacharyya, Pushpak",
editor = "Kondrak, Greg and
Watanabe, Taro",
booktitle = "Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = nov,
year = "2017",
address = "Taipei, Taiwan",
publisher = "Asian Federation of Natural Language Processing",
url = "https://aclanthology.org/I17-2006/",
pages = "31--36",
abstract = "Word embeddings are a relatively new addition to the modern NLP researcher`s toolkit. However, unlike other tools, word embeddings are used in a black box manner. There are very few studies regarding various hyperparameters. One such hyperparameter is the dimension of word embeddings. They are rather decided based on a rule of thumb: in the range 50 to 300. In this paper, we show that the dimension should instead be chosen based on corpus statistics. More specifically, we show that the number of pairwise equidistant words of the corpus vocabulary (as defined by some distance/similarity metric) gives a lower bound on the the number of dimensions , and going below this bound results in degradation of quality of learned word embeddings. Through our evaluations on standard word embedding evaluation tasks, we show that for dimensions higher than or equal to the bound, we get better results as compared to the ones below it."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="patel-bhattacharyya-2017-towards">
<titleInfo>
<title>Towards Lower Bounds on Number of Dimensions for Word Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Patel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Greg</namePart>
<namePart type="family">Kondrak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taro</namePart>
<namePart type="family">Watanabe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Asian Federation of Natural Language Processing</publisher>
<place>
<placeTerm type="text">Taipei, Taiwan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word embeddings are a relatively new addition to the modern NLP researcher‘s toolkit. However, unlike other tools, word embeddings are used in a black box manner. There are very few studies regarding various hyperparameters. One such hyperparameter is the dimension of word embeddings. They are rather decided based on a rule of thumb: in the range 50 to 300. In this paper, we show that the dimension should instead be chosen based on corpus statistics. More specifically, we show that the number of pairwise equidistant words of the corpus vocabulary (as defined by some distance/similarity metric) gives a lower bound on the the number of dimensions , and going below this bound results in degradation of quality of learned word embeddings. Through our evaluations on standard word embedding evaluation tasks, we show that for dimensions higher than or equal to the bound, we get better results as compared to the ones below it.</abstract>
<identifier type="citekey">patel-bhattacharyya-2017-towards</identifier>
<location>
<url>https://aclanthology.org/I17-2006/</url>
</location>
<part>
<date>2017-11</date>
<extent unit="page">
<start>31</start>
<end>36</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Lower Bounds on Number of Dimensions for Word Embeddings
%A Patel, Kevin
%A Bhattacharyya, Pushpak
%Y Kondrak, Greg
%Y Watanabe, Taro
%S Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 2: Short Papers)
%D 2017
%8 November
%I Asian Federation of Natural Language Processing
%C Taipei, Taiwan
%F patel-bhattacharyya-2017-towards
%X Word embeddings are a relatively new addition to the modern NLP researcher‘s toolkit. However, unlike other tools, word embeddings are used in a black box manner. There are very few studies regarding various hyperparameters. One such hyperparameter is the dimension of word embeddings. They are rather decided based on a rule of thumb: in the range 50 to 300. In this paper, we show that the dimension should instead be chosen based on corpus statistics. More specifically, we show that the number of pairwise equidistant words of the corpus vocabulary (as defined by some distance/similarity metric) gives a lower bound on the the number of dimensions , and going below this bound results in degradation of quality of learned word embeddings. Through our evaluations on standard word embedding evaluation tasks, we show that for dimensions higher than or equal to the bound, we get better results as compared to the ones below it.
%U https://aclanthology.org/I17-2006/
%P 31-36
Markdown (Informal)
[Towards Lower Bounds on Number of Dimensions for Word Embeddings](https://aclanthology.org/I17-2006/) (Patel & Bhattacharyya, IJCNLP 2017)
ACL