@InProceedings{haffari-tran-carman:2017:EACLlong,
  author    = {Haffari, Gholamreza and Tran, Tuan Dung and Carman, Mark},
  title     = {Efficient Benchmarking of {NLP} {APIs} using Multi-armed Bandits},
  booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {408--416},
  abstract  = {Comparing NLP systems to select the best one for a task of interest, such as
    named entity recognition, is critical for practitioners and researchers. A
    rigorous approach involves setting up a hypothesis testing scenario using the
    performance of the systems on query documents. However, often the hypothesis
    testing approach needs to send a lot of document queries to the systems, which
    can be problematic. In this paper, we present an effective alternative based on
    the multi-armed bandit (MAB). We propose a
    hierarchical generative model to represent the uncertainty in the performance
    measures of the competing systems, to be used by Thompson Sampling to solve the
    resulting MAB. Experimental results on both synthetic and real data show that
    our approach requires significantly fewer queries compared to the standard
    benchmarking technique to identify the best system according to F-measure.},
  url       = {http://www.aclweb.org/anthology/E17-1039}
}

