@inproceedings{gittens-etal-2017-skip,
title = "Skip-Gram − {Z}ipf + Uniform = Vector Additivity",
author = "Gittens, Alex and
Achlioptas, Dimitris and
Mahoney, Michael W.",
editor = "Barzilay, Regina and
Kan, Min-Yen",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P17-1007",
doi = "10.18653/v1/P17-1007",
pages = "69--76",
abstract = "In recent years word-embedding models have gained great popularity due to their remarkable performance on several tasks, including word analogy questions and caption generation. An unexpected {``}side-effect{''} of such models is that their vectors often exhibit compositionality, i.e., \textit{adding}two word-vectors results in a vector that is only a small angle away from the vector of a word representing the semantic composite of the original words, e.g., {``}man{''} + {``}royal{''} = {``}king{''}. This work provides a theoretical justification for the presence of additive compositionality in word vectors learned using the Skip-Gram model. In particular, it shows that additive compositionality holds in an even stricter sense (small distance rather than small angle) under certain assumptions on the process generating the corpus. As a corollary, it explains the success of vector calculus in solving word analogies. When these assumptions do not hold, this work describes the correct non-linear composition operator. Finally, this work establishes a connection between the Skip-Gram model and the Sufficient Dimensionality Reduction (SDR) framework of Globerson and Tishby: the parameters of SDR models can be obtained from those of Skip-Gram models simply by adding information on symbol frequencies. This shows that Skip-Gram embeddings are optimal in the sense of Globerson and Tishby and, further, implies that the heuristics commonly used to approximately fit Skip-Gram models can be used to fit SDR models.",
}