@inproceedings{aji-heafield-2019-making,
title = "Making Asynchronous Stochastic Gradient Descent Work for Transformers",
author = "Aji, Alham Fikri and
Heafield, Kenneth",
editor = "Birch, Alexandra and
Finch, Andrew and
Hayashi, Hiroaki and
Konstas, Ioannis and
Luong, Thang and
Neubig, Graham and
Oda, Yusuke and
Sudoh, Katsuhito",
booktitle = "Proceedings of the 3rd Workshop on Neural Generation and Translation",
month = nov,
year = "2019",
address = "Hong Kong",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D19-5608",
doi = "10.18653/v1/D19-5608",
pages = "80--89",
abstract = "Asynchronous stochastic gradient descent (SGD) converges poorly for Transformer models, so synchronous SGD has become the norm for Transformer training. This is unfortunate because asynchronous SGD is faster at raw training speed since it avoids waiting for synchronization. Moreover, the Transformer model is the basis for state-of-the-art models for several tasks, including machine translation, so training speed matters. To understand why asynchronous SGD under-performs, we blur the lines between asynchronous and synchronous methods. We find that summing several asynchronous updates, rather than applying them immediately, restores convergence behavior. With this method, the Transformer attains the same BLEU score 1.36 times as fast.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="aji-heafield-2019-making">
<titleInfo>
<title>Making Asynchronous Stochastic Gradient Descent Work for Transformers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenneth</namePart>
<namePart type="family">Heafield</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on Neural Generation and Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Birch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Finch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroaki</namePart>
<namePart type="family">Hayashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ioannis</namePart>
<namePart type="family">Konstas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thang</namePart>
<namePart type="family">Luong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graham</namePart>
<namePart type="family">Neubig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Oda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katsuhito</namePart>
<namePart type="family">Sudoh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hong Kong</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Asynchronous stochastic gradient descent (SGD) converges poorly for Transformer models, so synchronous SGD has become the norm for Transformer training. This is unfortunate because asynchronous SGD is faster at raw training speed since it avoids waiting for synchronization. Moreover, the Transformer model is the basis for state-of-the-art models for several tasks, including machine translation, so training speed matters. To understand why asynchronous SGD under-performs, we blur the lines between asynchronous and synchronous methods. We find that summing several asynchronous updates, rather than applying them immediately, restores convergence behavior. With this method, the Transformer attains the same BLEU score 1.36 times as fast.</abstract>
<identifier type="citekey">aji-heafield-2019-making</identifier>
<identifier type="doi">10.18653/v1/D19-5608</identifier>
<location>
<url>https://aclanthology.org/D19-5608</url>
</location>
<part>
<date>2019-11</date>
<extent unit="page">
<start>80</start>
<end>89</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Making Asynchronous Stochastic Gradient Descent Work for Transformers
%A Aji, Alham Fikri
%A Heafield, Kenneth
%Y Birch, Alexandra
%Y Finch, Andrew
%Y Hayashi, Hiroaki
%Y Konstas, Ioannis
%Y Luong, Thang
%Y Neubig, Graham
%Y Oda, Yusuke
%Y Sudoh, Katsuhito
%S Proceedings of the 3rd Workshop on Neural Generation and Translation
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong
%F aji-heafield-2019-making
%X Asynchronous stochastic gradient descent (SGD) converges poorly for Transformer models, so synchronous SGD has become the norm for Transformer training. This is unfortunate because asynchronous SGD is faster at raw training speed since it avoids waiting for synchronization. Moreover, the Transformer model is the basis for state-of-the-art models for several tasks, including machine translation, so training speed matters. To understand why asynchronous SGD under-performs, we blur the lines between asynchronous and synchronous methods. We find that summing several asynchronous updates, rather than applying them immediately, restores convergence behavior. With this method, the Transformer attains the same BLEU score 1.36 times as fast.
%R 10.18653/v1/D19-5608
%U https://aclanthology.org/D19-5608
%U https://doi.org/10.18653/v1/D19-5608
%P 80-89
Markdown (Informal)
[Making Asynchronous Stochastic Gradient Descent Work for Transformers](https://aclanthology.org/D19-5608) (Aji & Heafield, NGT 2019)
ACL