@InProceedings{devlin:2017:EMNLP2017,
  author    = {Devlin, Jacob},
  title     = {Sharp Models on Dull Hardware: Fast and Accurate Neural Machine Translation Decoding on the CPU},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2820--2825},
  abstract  = {Attentional sequence-to-sequence models have become the new standard for
	machine translation, but one challenge of such models is a significant increase
	in training and decoding cost compared to phrase-based systems. In this work we
	focus on efficient decoding, with a goal of achieving accuracy close to the
	state-of-the-art in neural machine translation (NMT), while achieving CPU
	decoding speed/throughput close to that of a phrasal decoder.
	We approach this problem from two angles: First, we describe several techniques
	for speeding up an NMT beam search decoder, which obtain a 4.4x speedup over a
	very efficient baseline decoder without changing the decoder output. Second, we
	propose a simple but powerful network architecture which uses an RNN (GRU/LSTM)
	layer at the bottom, followed by a series of stacked fully-connected layers applied
	at every timestep. This architecture achieves similar accuracy to a deep
	recurrent model, at a small fraction of the training and decoding cost. By
	combining these techniques, our best system achieves a very competitive
	accuracy of 38.3 BLEU on WMT English-French NewsTest2014, while decoding at 100
	words/sec on single-threaded CPU. We believe this is the best published
	accuracy/speed trade-off of an NMT system.},
  url       = {https://www.aclweb.org/anthology/D17-1300}
}

