@inproceedings{aji-heafield-2017-sparse,
    title = "Sparse Communication for Distributed Gradient Descent",
    author = "Aji, Alham Fikri  and
      Heafield, Kenneth",
    editor = "Palmer, Martha  and
      Hwa, Rebecca  and
      Riedel, Sebastian",
    booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
    month = sep,
    year = "2017",
    address = "Copenhagen, Denmark",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D17-1045",
    doi = "10.18653/v1/D17-1045",
    pages = "440--445",
    abstract = "We make distributed stochastic gradient descent faster by exchanging sparse updates instead of dense updates. Gradient updates are positively skewed as most updates are near zero, so we map the 99{\%} smallest updates (by absolute value) to zero then exchange sparse matrices. This method can be combined with quantization to further improve the compression. We explore different configurations and apply them to neural machine translation and MNIST image classification tasks. Most configurations work on MNIST, whereas different configurations reduce convergence rate on the more complex translation task. Our experiments show that we can achieve up to 49{\%} speed up on MNIST and 22{\%} on NMT without damaging the final accuracy or BLEU.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="aji-heafield-2017-sparse">
    <titleInfo>
        <title>Sparse Communication for Distributed Gradient Descent</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Alham</namePart>
        <namePart type="given">Fikri</namePart>
        <namePart type="family">Aji</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Kenneth</namePart>
        <namePart type="family">Heafield</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2017-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Martha</namePart>
            <namePart type="family">Palmer</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Rebecca</namePart>
            <namePart type="family">Hwa</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Sebastian</namePart>
            <namePart type="family">Riedel</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Copenhagen, Denmark</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We make distributed stochastic gradient descent faster by exchanging sparse updates instead of dense updates. Gradient updates are positively skewed as most updates are near zero, so we map the 99% smallest updates (by absolute value) to zero then exchange sparse matrices. This method can be combined with quantization to further improve the compression. We explore different configurations and apply them to neural machine translation and MNIST image classification tasks. Most configurations work on MNIST, whereas different configurations reduce convergence rate on the more complex translation task. Our experiments show that we can achieve up to 49% speed up on MNIST and 22% on NMT without damaging the final accuracy or BLEU.</abstract>
    <identifier type="citekey">aji-heafield-2017-sparse</identifier>
    <identifier type="doi">10.18653/v1/D17-1045</identifier>
    <location>
        <url>https://aclanthology.org/D17-1045</url>
    </location>
    <part>
        <date>2017-09</date>
        <extent unit="page">
            <start>440</start>
            <end>445</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sparse Communication for Distributed Gradient Descent
%A Aji, Alham Fikri
%A Heafield, Kenneth
%Y Palmer, Martha
%Y Hwa, Rebecca
%Y Riedel, Sebastian
%S Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing
%D 2017
%8 September
%I Association for Computational Linguistics
%C Copenhagen, Denmark
%F aji-heafield-2017-sparse
%X We make distributed stochastic gradient descent faster by exchanging sparse updates instead of dense updates. Gradient updates are positively skewed as most updates are near zero, so we map the 99% smallest updates (by absolute value) to zero then exchange sparse matrices. This method can be combined with quantization to further improve the compression. We explore different configurations and apply them to neural machine translation and MNIST image classification tasks. Most configurations work on MNIST, whereas different configurations reduce convergence rate on the more complex translation task. Our experiments show that we can achieve up to 49% speed up on MNIST and 22% on NMT without damaging the final accuracy or BLEU.
%R 10.18653/v1/D17-1045
%U https://aclanthology.org/D17-1045
%U https://doi.org/10.18653/v1/D17-1045
%P 440-445
Markdown (Informal)
[Sparse Communication for Distributed Gradient Descent](https://aclanthology.org/D17-1045) (Aji & Heafield, EMNLP 2017)
ACL
Alham Fikri Aji and Kenneth Heafield. 2017. Sparse Communication for Distributed Gradient Descent. In Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing, pages 440–445, Copenhagen, Denmark. Association for Computational Linguistics.
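
The abstract describes dropping the 99% smallest gradient updates (by absolute value) and exchanging only the survivors as a sparse matrix. The sketch below is a minimal, illustrative rendering of that selection step in NumPy, not the authors' implementation: the function name, the per-tensor thresholding, and the omission of details such as local accumulation of dropped values and quantization are all simplifying assumptions.

```python
import numpy as np

def sparsify_gradient(grad: np.ndarray, drop_ratio: float = 0.99):
    """Keep only the largest-magnitude (1 - drop_ratio) fraction of entries.

    Returns the indices and values to exchange instead of the dense gradient.
    Illustrative sketch only: the paper's full method also handles residuals
    of dropped values and optional quantization, omitted here.
    """
    flat = grad.ravel()
    k = max(1, int(round(flat.size * (1.0 - drop_ratio))))  # entries to keep
    # Threshold = magnitude of the k-th largest absolute value.
    threshold = np.partition(np.abs(flat), flat.size - k)[flat.size - k]
    keep = np.abs(flat) >= threshold
    indices = np.nonzero(keep)[0]
    values = flat[indices]
    return indices, values

# Usage: a million-element gradient compresses to roughly 1% of its entries.
rng = np.random.default_rng(0)
g = rng.normal(scale=0.01, size=1_000_000)
idx, vals = sparsify_gradient(g, drop_ratio=0.99)
print(len(idx) / g.size)  # ~0.01
```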