@InProceedings{graham-EtAl:2016:COLING,
  author    = {Graham, Yvette and Baldwin, Timothy and Dowling, Meghan and Eskevich, Maria and Lynn, Teresa and Tounsi, Lamia},
  title     = {Is all that Glitters in Machine Translation Quality Estimation really Gold?},
  booktitle = {Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = {December},
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {3124--3134},
  abstract  = {Human-targeted metrics provide a compromise between human evaluation
               of machine translation, where high inter-annotator agreement is
               difficult to achieve, and fully automatic metrics, such as BLEU or
               TER, that lack the validity of human assessment. Human-targeted
               translation edit rate (HTER) is by far the most widely employed
               human-targeted metric in machine translation, commonly employed, for
               example, as a gold standard in evaluation of quality estimation.
               Original experiments justifying the design of HTER, as opposed to
               other possible formulations, were limited to a small sample of
               translations and a single language pair, however, and this motivates
               our re-evaluation of a range of human-targeted metrics on a
               substantially larger scale. Results show significantly stronger
               correlation with human judgment for HBLEU over HTER for two of the
               nine language pairs we include and no significant difference between
               correlations achieved by HTER and HBLEU for the remaining language
               pairs. Finally, we evaluate a range of quality estimation systems
               employing HTER and direct assessment (DA) of translation adequacy as
               gold labels, resulting in a divergence in system rankings, and
               propose employment of DA for future quality estimation evaluations.},
  url       = {http://aclweb.org/anthology/C16-1294}
}

