@inproceedings{barnes-klinger-schulteimwalde:2017:WASSA2017,
  author    = {Barnes, Jeremy and Klinger, Roman and {Schulte im Walde}, Sabine},
  title     = {Assessing State-of-the-Art Sentiment Models on State-of-the-Art Sentiment Datasets},
  booktitle = {Proceedings of the 8th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2--12},
  doi       = {10.18653/v1/W17-5202},
  url       = {https://aclanthology.org/W17-5202},
  abstract  = {There has been a good amount of progress in sentiment analysis over
	  the past 10 years, including the proposal of new methods and the
	  creation of benchmark datasets. In some papers, however, there is a
	  tendency to compare models only on one or two datasets, either
	  because of time restraints or because the model is tailored to a
	  specific task. Accordingly, it is hard to understand how well a
	  certain model generalizes across different tasks and datasets. In
	  this paper, we contribute to this situation by comparing several
	  models on six different benchmarks, which belong to different
	  domains and additionally have different levels of granularity
	  (binary, 3-class, 4-class and 5-class). We show that Bi-LSTMs
	  perform well across datasets and that both LSTMs and Bi-LSTMs are
	  particularly good at fine-grained sentiment tasks (i.e., with more
	  than two classes). Incorporating sentiment information
	  into word embeddings during training gives good results for datasets
	  that are lexically similar to the training data. With our
	  experiments, we contribute to a better understanding of the
	  performance of different model architectures on different data
	  sets. Consequently, we detect
	  novel state-of-the-art results on the \textit{SenTube} datasets.},
}

