% Conference paper from the EMNLP 2017 proceedings (Reimers and Gurevych).
% NOTE(review): `address` appears to hold the conference location rather than
% the publisher's city -- kept as exported; confirm against the target style.
@InProceedings{reimers-gurevych:2017:EMNLP2017,
  author    = {Reimers, Nils and Gurevych, Iryna},
  title     = {Reporting Score Distributions Makes a Difference: Performance Study of {LSTM}-networks for Sequence Tagging},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {338--348},
  abstract  = {In this paper we show that reporting a single performance score is insufficient
               to compare non-deterministic approaches. We demonstrate for common sequence
               tagging tasks that the seed value for the random number generator can result in
               statistically significant ($p < 10^{-4}$) differences for state-of-the-art systems.
               For two recent systems for NER, we observe an absolute difference of one
               percentage point $F_1$-score depending on the selected seed value, making these
               systems perceived either as state-of-the-art or mediocre. Instead of publishing
               and reporting single performance scores, we propose to compare score
               distributions based on multiple executions.
               Based on the evaluation of 50.000 LSTM-networks for five sequence tagging
               tasks, we present network architectures that produce both superior performance
               as well as are more stable with respect to the remaining hyperparameters.},
  url       = {https://www.aclweb.org/anthology/D17-1035}
}