% Malmasi and Dras (2017), ACL 2017 short paper on feature hashing for
% language identification. LaTeX-special characters in the abstract are
% escaped so the field is safe to typeset with abstract-printing styles.
@inproceedings{malmasi-dras:2017:Short,
  author    = {Malmasi, Shervin and Dras, Mark},
  title     = {Feature Hashing for Language and Dialect Identification},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {399--403},
  doi       = {10.18653/v1/P17-2063},
  url       = {https://aclanthology.org/P17-2063/},
  abstract  = {We evaluate feature hashing for language identification (LID), a method not
               previously used for this task. Using a standard dataset, we first show that
               while feature performance is high, LID data is highly dimensional and mostly
               sparse ($>$99.5\%) as it includes large vocabularies for many languages; memory
               requirements grow as languages are added. Next we apply hashing using various
               hash sizes, demonstrating that there is no performance loss with dimensionality
               reductions of up to 86\%. We also show that using an ensemble of low-dimension
               hash-based classifiers further boosts performance. Feature hashing is highly
               useful for LID and holds great promise for future work in this area.},
}

