% Conference paper (EMNLP 2017, pp. 440--445) on sparse gradient exchange for
% distributed SGD. Citation key kept as exported so existing \cite commands
% keep working. Percent signs in the abstract are escaped as \% because a bare
% percent sign starts a TeX comment once the field reaches the .bbl file.
@InProceedings{aji-heafield:2017:EMNLP2017,
  author    = {Aji, Alham Fikri and Heafield, Kenneth},
  title     = {Sparse Communication for Distributed Gradient Descent},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {440--445},
  abstract  = {We make distributed stochastic gradient descent faster by exchanging sparse
               updates instead of dense updates. Gradient updates are positively skewed as
               most updates are near zero, so we map the 99\% smallest updates (by absolute
               value) to zero then exchange sparse matrices. This method can be combined with
               quantization to further improve the compression. We explore different
               configurations and apply them to neural machine translation and MNIST image
               classification tasks. Most configurations work on MNIST, whereas different
               configurations reduce convergence rate on the more complex translation task.
               Our experiments show that we can achieve up to 49\% speed up on MNIST and 22\% on
               NMT without damaging the final accuracy or BLEU.},
  doi       = {10.18653/v1/D17-1045},
  url       = {https://www.aclweb.org/anthology/D17-1045},
}