@InProceedings{roberts:2016:ClinicalNLP,
  author    = {Roberts, Kirk},
  title     = {Assessing the Corpus Size vs. Similarity Trade-off for Word Embeddings in Clinical NLP},
  booktitle = {Proceedings of the Clinical Natural Language Processing Workshop (ClinicalNLP)},
  month     = {December},
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {54--63},
  abstract  = {The proliferation of deep learning methods in natural language processing (NLP)
	and the large amounts of data they often require stand in stark contrast to
	the relatively data-poor clinical NLP domain. In particular, large text corpora
	are necessary to build high-quality word embeddings, yet often large corpora
	that are suitably representative of the target clinical data are unavailable. 
	This forces a choice between building embeddings from small clinical corpora
	and from larger but less representative corpora. This paper explores this trade-off, as
	well as intermediate compromise solutions. Two standard clinical NLP tasks (the
	i2b2 2010 concept and assertion tasks) are evaluated with commonly used deep
	learning models (recurrent neural networks and convolutional neural networks)
	using a set of six corpora ranging from the target i2b2 data to large
	open-domain datasets. While combinations of corpora are generally found to work
	best, the single-best corpus is task-dependent.},
  url       = {http://aclweb.org/anthology/W16-4208}
}

