@InProceedings{stahlberg-byrne:2017:EMNLP2017,
  author    = {Stahlberg, Felix and Byrne, Bill},
  title     = {Unfolding and Shrinking Neural Machine Translation Ensembles},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1946--1956},
  abstract  = {Ensembling is a well-known technique in neural machine translation (NMT) for
               improving system performance. Instead of a single neural net, multiple neural
               nets with the same topology are trained separately, and the decoder generates
               predictions by averaging over the individual models. Ensembling often
               drastically improves the quality of the generated translations. However, it
               is not suitable for production systems because it is cumbersome and slow.
               This work aims to reduce the runtime to be on par with a single system
               without compromising translation quality. First, we show that the ensemble
               can be unfolded into a single large neural network which imitates the output
               of the ensemble system. We show that unfolding can already improve the
               runtime in practice since more work can be done on the GPU. We then describe
               a set of techniques to shrink the unfolded network by reducing the
               dimensionality of its layers. On Japanese-English, we report that the
               resulting network has the size and decoding speed of a single NMT network
               but performs on the level of a 3-ensemble system.},
  url       = {https://www.aclweb.org/anthology/D17-1208}
}
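% Note: a minimal sketch of the ensemble averaging described in the abstract,
% assuming an ensemble of K separately trained models P_k (notation ours, not
% taken from the paper):
%   P(y_t \mid y_{<t}, x) = \frac{1}{K} \sum_{k=1}^{K} P_k(y_t \mid y_{<t}, x)
% Unfolding then replaces the K separate nets with one larger net that imitates
% this averaged output, which the paper subsequently shrinks by reducing layer
% dimensionality.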

