@inproceedings{novikova-EtAl:2017:EMNLP2017,
  author    = {Novikova, Jekaterina and Du\v{s}ek, Ond\v{r}ej and Cercas Curry, Amanda and Rieser, Verena},
  title     = {Why We Need New Evaluation Metrics for {NLG}},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2241--2252},
  doi       = {10.18653/v1/D17-1238},
  url       = {https://www.aclweb.org/anthology/D17-1238},
  abstract  = {The majority of NLG evaluation relies on automatic metrics, such as BLEU. In
    this paper, we motivate the need for novel, system- and data-independent
    automatic evaluation methods: We investigate a wide range of metrics, including
    state-of-the-art word-based and novel grammar-based ones, and demonstrate that
    they only weakly reflect human judgements of system outputs as generated by
    data-driven, end-to-end NLG. We also show that metric performance is data- and
    system-specific. Nevertheless, our results also suggest that automatic metrics
    perform reliably at system-level and can support system development by finding
    cases where a system performs poorly.},
}

