% COLING 2016 technical paper introducing the SubIMDB subtitle corpus.
% month uses the predefined BibTeX macro so the style controls its rendering;
% the acronym in booktitle is brace-protected against style recasing.
@InProceedings{paetzold-specia:2016:COLING2,
  author    = {Paetzold, Gustavo and Specia, Lucia},
  title     = {Collecting and Exploring Everyday Language for Predicting Psycholinguistic Properties of Words},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {1669--1679},
  abstract  = {Exploring language usage through frequency analysis in large corpora is a
    defining feature in most recent work in corpus and computational linguistics.
    From a psycholinguistic perspective, however, the corpora used in these
    contributions are often not representative of language usage: they are either
    domain-specific, limited in size, or extracted from unreliable sources. In an
    effort to address this limitation, we introduce SubIMDB, a corpus of everyday
    language spoken text we created which contains over 225 million words. The
    corpus was extracted from 38,102 subtitles of family, comedy and children
    movies and series, and is the first sizeable structured corpus of subtitles
    made available. Our experiments show that word frequency norms extracted from
    this corpus are more effective than those from well-known norms such as
    Kucera-Francis, HAL and SUBTLEXus in predicting various psycholinguistic
    properties of words, such as lexical decision times, familiarity, age of
    acquisition and simplicity. We also provide evidence that contradict the
    long-standing assumption that the ideal size for a corpus can be determined
    solely based on how well its word frequencies correlate with lexical decision
    times.},
  url       = {http://aclweb.org/anthology/C16-1157},
}

