% EMNLP 2017 conference paper (ACL Anthology ID D17-1272, per the url field).
% The month is given as the standard three-letter macro so that the
% bibliography style decides how it is abbreviated or localized.
@inproceedings{lawrence-sokolov-riezler:2017:EMNLP2017,
  author    = {Lawrence, Carolin and Sokolov, Artem and Riezler, Stefan},
  title     = {Counterfactual Learning from Bandit Feedback under Deterministic Logging: A Case Study in Statistical Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2566--2576},
  abstract  = {The goal of counterfactual learning for statistical machine translation (SMT)
               is to optimize a target SMT system from logged data that consist of user
               feedback to translations that were predicted by another, historic SMT system. A
               challenge arises by the fact that risk-averse commercial SMT systems
               deterministically log the most probable translation. The lack of sufficient
               exploration of the SMT output space seemingly contradicts the theoretical
               requirements for counterfactual learning. We show that counterfactual learning
               from deterministic bandit logs is possible nevertheless by smoothing out
               deterministic components in learning. This can be achieved by additive and
               multiplicative control variates that avoid degenerate behavior in empirical
               risk minimization. Our simulation experiments show improvements of up to 2 BLEU
               points by counterfactual learning from deterministic bandit feedback.},
  url       = {https://www.aclweb.org/anthology/D17-1272},
}

