% ACL 2017 short paper (Anthology ID P17-2013). The month is given as the
% predefined macro jul (unbraced) so the bibliography style controls its rendering.
@InProceedings{almodaresi-EtAl:2017:Short,
  author    = {Almodaresi, Fatemeh and Ungar, Lyle and Kulkarni, Vivek and Zakeri, Mohsen and Giorgi, Salvatore and Schwartz, H. Andrew},
  title     = {On the Distribution of Lexical Features at Multiple Levels of Analysis},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {79--84},
  abstract  = {Natural language processing has increasingly moved from modeling documents and
	words toward studying the people behind the language. This move to working with
	data at the user or community level has presented the field with different
	characteristics of linguistic data. In this paper, we empirically characterize
	various lexical distributions at different levels of analysis, showing that,
	while most features are decidedly sparse and non-normal at the message-level
	(as with traditional NLP), they follow the central limit theorem to become much
	more Log-normal or even Normal at the user- and county-levels. Finally, we
	demonstrate that modeling lexical features for the correct level of analysis
	leads to marked improvements in common social scientific prediction tasks.},
  url       = {http://aclweb.org/anthology/P17-2013}
}