@InProceedings{zhang-EtAl:2017:EMNLP20172,
  author    = {Zhang, Xiaowei and Chen, Wei and Wang, Feng and Xu, Shuang and Xu, Bo},
  title     = {Towards Compact and Fast Neural Machine Translation Using a Combined Method},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = {September},
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1475--1481},
  abstract  = {Neural Machine Translation (NMT) places a heavy burden on computation
	and memory, so deploying NMT models on devices with limited computation
	and memory budgets is a challenge. This paper presents a four-stage
	pipeline to compress the model and speed up decoding for NMT. Our method
	first introduces a compact architecture based on a convolutional encoder
	and weight-shared embeddings. Weight pruning is then applied to obtain a
	sparse model. Next, we propose a fast sequence interpolation approach
	that enables greedy decoding to match the performance of beam search, so
	the time-consuming beam search can be replaced by simple greedy decoding.
	Finally, vocabulary selection is used to reduce the computation of the
	softmax layer. Our final model achieves a 10x speedup, a 17x reduction in
	parameters, a storage size under 35 MB, and performance comparable to the
	baseline model.},
  url       = {https://www.aclweb.org/anthology/D17-1154}
}
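
% Note: the weight-pruning stage named in the abstract is a standard
% magnitude-based compression step. The sketch below is a minimal, hedged
% illustration of that general technique, not the paper's actual procedure:
% its pruning schedule and thresholds are not given here, and the function
% name `magnitude_prune` and the 80% sparsity level are assumptions of this
% example.

import numpy as np

def magnitude_prune(weights: np.ndarray, sparsity: float) -> np.ndarray:
    """Zero out the smallest-magnitude weights so that `sparsity` of them are zero.

    Generic magnitude-based pruning sketch; illustrative only.
    """
    k = int(weights.size * sparsity)
    if k == 0:
        return weights.copy()
    # Threshold = k-th smallest |w|; everything at or below it is cut.
    threshold = np.partition(np.abs(weights).ravel(), k - 1)[k - 1]
    pruned = weights.copy()
    pruned[np.abs(pruned) <= threshold] = 0.0
    return pruned

# Usage: prune 80% of a random weight matrix (hypothetical setting).
w = np.random.randn(512, 512)
w_sparse = magnitude_prune(w, sparsity=0.8)
print(f"sparsity: {np.mean(w_sparse == 0.0):.2%}")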

