@inproceedings{nguyen-daumeiii-boydgraber:2017:EMNLP2017,
  author    = {Nguyen, Khanh and Daum{\'e} III, Hal and Boyd-Graber, Jordan},
  title     = {Reinforcement Learning for Bandit Neural Machine Translation with Simulated Human Feedback},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1464--1474},
  doi       = {10.18653/v1/D17-1153},
  url       = {https://www.aclweb.org/anthology/D17-1153},
  abstract  = {Machine translation is a natural candidate
	problem for reinforcement learning from
	human feedback: users provide quick,
	dirty ratings on candidate translations to
	guide a system to improve. Yet, current
	neural machine translation training focuses
	on expensive human-generated reference
	translations. We describe a reinforcement
	learning algorithm that improves
	neural machine translation systems
	from simulated human feedback.
	Our algorithm combines the advantage
	actor-critic algorithm (Mnih et al., 2016)
	with the attention-based neural encoder-decoder
	architecture (Luong et al., 2015).
	This algorithm (a) is well-designed for
	problems with a large action space and
	delayed rewards, (b) effectively optimizes
	traditional corpus-level machine translation
	metrics, and (c) is robust to skewed,
	high-variance, granular feedback modeled
	after actual human behaviors.},
}

