@inproceedings{chaganty-EtAl:2017:EMNLP2017,
  author    = {Chaganty, Arun and Paranjape, Ashwin and Liang, Percy and Manning, Christopher D.},
  title     = {Importance sampling for unbiased on-demand evaluation of knowledge base population},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1038--1048},
  doi       = {10.18653/v1/D17-1109},
  url       = {https://www.aclweb.org/anthology/D17-1109},
  abstract  = {Knowledge base population (KBP) systems take in a large document corpus and
               extract entities and their relations. Thus far, KBP evaluation has relied on
               judgements on the pooled predictions of existing systems.
               We show that this evaluation is problematic: when a new system predicts a
               previously unseen relation, it is penalized even if it is correct. This leads
               to significant bias against new systems, which counterproductively discourages
               innovation in the field. Our first contribution is a new importance-sampling
               based evaluation which corrects for this bias by annotating a new system's
               predictions on-demand via crowdsourcing. We show this eliminates bias and
               reduces variance using data from the 2015 TAC KBP task. Our second contribution
               is an implementation of our method made publicly available as an online KBP
               evaluation service. We pilot the service by testing diverse state-of-the-art
               systems on the TAC KBP 2016 corpus and obtain accurate scores in a cost
               effective manner.},
  internal-note = {address holds the conference venue (ACL-export convention), not the publisher city; kept as-is since ACL styles expect it},
}

