% Singh et al., COLING 2016 (Technical Papers): sub-word similarity search
% for inducing rare-word embeddings. Note: the "%" in the abstract is escaped
% as "\%" because BibTeX copies field text verbatim into the .bbl file.
@InProceedings{singh-EtAl:2016:COLING1,
  author    = {Singh, Mittul and Greenberg, Clayton and Oualil, Youssef and Klakow, Dietrich},
  title     = {Sub-Word Similarity based Search for Embeddings: Inducing Rare-Word Embeddings for Word Similarity Tasks and Language Modelling},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {2061--2070},
  abstract  = {Training good word embeddings requires large amounts of data.
               Out-of-vocabulary words will still be encountered at test-time,
               leaving these words without embeddings. To overcome this lack of
               embeddings for rare words, existing methods leverage
               morphological features to generate embeddings. While the
               existing methods use computationally-intensive rule-based
               (Soricut and Och, 2015) or tool-based (Botha and Blunsom, 2014)
               morphological analysis to generate embeddings, our system
               applies a computationally-simpler sub-word search on words that
               have existing embeddings. Embeddings of the sub-word search
               results are then combined using string similarity functions to
               generate rare word embeddings. We augmented pre-trained word
               embeddings with these novel embeddings and evaluated on a rare
               word similarity task, obtaining up to 3 times improvement in
               correlation over the original set of embeddings. Applying our
               technique to embeddings trained on larger datasets led to on-par
               performance with the existing state-of-the-art for this task.
               Additionally, while analysing augmented embeddings in a
               log-bilinear language model, we observed up to 50\% reduction in
               rare word perplexity in comparison to other more complex
               language models.},
  url       = {http://aclweb.org/anthology/C16-1194}
}

