% Sarker and Gonzalez (2016): BioTxtM2016 workshop paper presenting data sets, tools and resources for mining drug-related social media chatter.
@InProceedings{sarker-gonzalez:2016:BioTxtM2016,
  author    = {Sarker, Abeed and Gonzalez, Graciela},
  title     = {Data, tools and resources for mining social media drug chatter},
  booktitle = {Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining ({BioTxtM2016})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {99--107},
  abstract  = {Social media has emerged into a crucial resource for obtaining population-based
               signals for various public health monitoring and surveillance tasks, such as
               pharmacovigilance. There is an abundance of knowledge hidden within social
               media data, and the volume is growing. Drug-related chatter on social media can
               include user-generated information that can provide insights into public health
               problems such as abuse, adverse reactions, long-term effects, and multi-drug
               interactions. Our objective in this paper is to present to the biomedical
               natural language processing, data science, and public health communities data
               sets (annotated and unannotated), tools and resources that we have collected
               and created from social media. The data we present was collected from Twitter
               using the generic and brand names of drugs as keywords, along with their common
               misspellings. Following the collection of the data, annotation guidelines were
               created over several iterations, which detail important aspects of social media
               data annotation and can be used by future researchers for developing similar
               data sets. The annotation guidelines were followed to prepare data sets for
               text classification, information extraction and normalization. In this paper,
               we discuss the preparation of these guidelines, outline the data sets prepared,
               and present an overview of our state-of-the-art systems for data collection,
               supervised classification, and information extraction. In addition to the
               development of supervised systems for classification and extraction, we
               developed and released unlabeled data and language models. We discuss the
               potential uses of these language models in data mining and the large volumes of
               unlabeled data from which they were generated. We believe that the summaries
               and repositories we present here of our data, annotation guidelines, models,
               and tools will be beneficial to the research community as a single-point entry
               for all these resources, and will promote further research in this area.},
  url       = {http://aclweb.org/anthology/W16-5111}
}

