@InProceedings{graham-EtAl:2017:EACLshort,
  author    = {Graham, Yvette  and  Ma, Qingsong  and  Baldwin, Timothy  and  Liu, Qun  and  Parra, Carla  and  Scarton, Carolina},
  title     = {Improving Evaluation of Document-level Machine Translation Quality Estimation},
  booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers},
  month     = {April},
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {356--361},
  abstract  = {Meaningful conclusions about the relative performance of NLP systems are
	only possible if the gold standard employed in a given evaluation is both
	valid and reliable. In this paper, we explore the validity of human
	annotations currently employed in the evaluation of document-level quality
	estimation for machine translation (MT). We demonstrate the degree to which
	MT system rankings are dependent on weights employed in the construction of
	the gold standard, before proposing direct human assessment as a valid
	alternative. Experiments show direct assessment (DA) scores for documents to
	be highly reliable, achieving a correlation of above 0.9 in a
	self-replication experiment, in addition to a substantial estimated cost
	reduction through quality-controlled crowd-sourcing. The original gold
	standard based on post-edits incurs a 10--20 times greater cost than DA.},
  url       = {http://www.aclweb.org/anthology/E17-2057}
}

