@InProceedings{dernoncourt-lee:2017:I17-2,
  author    = {Dernoncourt, Franck and Lee, Ji Young},
  title     = {{PubMed} 200k {RCT}: a Dataset for Sequential Sentence Classification in Medical Abstracts},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  month     = nov,
  year      = {2017},
  address   = {Taipei, Taiwan},
  publisher = {Asian Federation of Natural Language Processing},
  pages     = {308--313},
  abstract  = {We present PubMed 200k RCT, a new dataset based on PubMed for sequential
               sentence classification. The dataset consists of approximately 200,000
               abstracts of randomized controlled trials, totaling 2.3 million sentences. Each
               sentence of each abstract is labeled with their role in the abstract using one
               of the following classes: background, objective, method, result, or conclusion.
               The purpose of releasing this dataset is twofold. First, the majority of
               datasets for sequential short-text classification (i.e., classification of
               short texts that appear in sequences) are small: we hope that releasing a new
               large dataset will help develop more accurate algorithms for this task. Second,
               from an application perspective, researchers need better tools to efficiently
               skim through the literature. Automatically classifying each sentence in an
               abstract would help researchers read abstracts more efficiently, especially in
               fields where abstracts may be long, such as the medical field.},
  url       = {http://www.aclweb.org/anthology/I17-2052}
}

