% EMNLP 2017 paper on reference bias in monolingual MT evaluation (ACL Anthology D17-1262).
@inproceedings{ma-EtAl:2017:EMNLP2017,
  author    = {Ma, Qingsong and Graham, Yvette and Baldwin, Timothy and Liu, Qun},
  title     = {Further Investigation into Reference Bias in Monolingual Evaluation of Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2476--2485},
  doi       = {10.18653/v1/D17-1262},
  url       = {https://www.aclweb.org/anthology/D17-1262},
  abstract  = {Monolingual evaluation of Machine Translation (MT) aims to simplify
               human assessment by requiring assessors to compare the meaning of the
               MT output with a reference translation, opening up the task to a much
               larger pool of genuinely qualified evaluators. Monolingual evaluation
               runs the risk, however, of bias in favour of MT systems that happen to
               produce translations superficially similar to the reference and,
               consistent with this intuition, previous investigations have concluded
               monolingual assessment to be strongly biased in this respect. On
               re-examination of past analyses, we identify a series of potential
               analytical errors that force some important questions to be raised
               about the reliability of past conclusions, however. We subsequently
               carry out further investigation into reference bias via direct human
               assessment of MT adequacy via quality controlled crowd-sourcing.
               Contrary to both intuition and past conclusions, results show no
               significant evidence of reference bias in monolingual evaluation of
               MT.},
}

