@InProceedings{lowe-EtAl:2017:Long,
  author    = {Lowe, Ryan  and  Noseworthy, Michael  and  Serban, Iulian Vlad  and  Angelard-Gontier, Nicolas  and  Bengio, Yoshua  and  Pineau, Joelle},
  title     = {Towards an Automatic Turing Test: Learning to Evaluate Dialogue Responses},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1116--1126},
  abstract  = {Automatically evaluating the quality of dialogue responses for unstructured domains is a challenging problem. Unfortunately, existing automatic evaluation metrics are biased and correlate very poorly with human judgements of response quality (Liu et al., 2016). Yet having an accurate automatic evaluation procedure is crucial for dialogue research, as it allows rapid prototyping and testing of new models with fewer expensive human evaluations. In response to this challenge, we formulate automatic dialogue evaluation as a learning problem. We present an evaluation model (ADEM) that learns to predict human-like scores to input responses, using a new dataset of human response scores. We show that the ADEM model's predictions correlate significantly, and at a level much higher than word-overlap metrics such as BLEU, with human judgements at both the utterance and system-level. We also show that ADEM can generalize to evaluating dialogue models unseen during training, an important step for automatic dialogue evaluation.},
  url       = {http://aclweb.org/anthology/P17-1103}
}

