% SIGMORPHON 2018 workshop paper (ACL Anthology ID W18-5814); month uses the standard BibTeX macro so styles can abbreviate/localise it.
@InProceedings{pierrehumbert-granell:2018:SIGMORPHON,
  author    = {Pierrehumbert, Janet and Granell, Ramon},
  title     = {On Hapax Legomena and Morphological Productivity},
  booktitle = {Proceedings of the Fifteenth Workshop on Computational Research in Phonetics, Phonology, and Morphology},
  month     = oct,
  year      = {2018},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  pages     = {125--130},
  abstract  = {Quantifying and predicting morphological productivity is a long-standing challenge in corpus linguistics and psycholinguistics. The same challenge reappears in natural language processing in the context of handling words that were not seen in the training set (out-of-vocabulary, or OOV, words). Prior research showed that a good indicator of the productivity of a morpheme is the number of words involving it that occur exactly once (the hapax legomena). A technical connection was adduced between this result and Good-Turing smoothing, which assigns probability mass to unseen events on the basis of the simplifying assumption that word frequencies are stationary. In a large-scale study of 133 affixes in Wikipedia, we develop evidence that success in fact depends on tapping the frequency range in which the assumptions of Good-Turing are violated.},
  url       = {http://www.aclweb.org/anthology/W18-5814}
}

