@inproceedings{lu-etal-2017-adagrad,
title = "Why {ADAGRAD} Fails for Online Topic Modeling",
author = "Lu, You and
Lund, Jeffrey and
Boyd-Graber, Jordan",
editor = "Palmer, Martha and
Hwa, Rebecca and
Riedel, Sebastian",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D17-1046",
doi = "10.18653/v1/D17-1046",
pages = "446--451",
abstract = "Online topic modeling, i.e., topic modeling with stochastic variational inference, is a powerful and efficient technique for analyzing large datasets, and ADAGRAD is a widely-used technique for tuning learning rates during online gradient optimization. However, these two techniques do not work well together. We show that this is because ADAGRAD uses accumulation of previous gradients as the learning rates{'} denominators. For online topic modeling, the magnitude of gradients is very large. It causes learning rates to shrink very quickly, so the parameters cannot fully converge until the training ends",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="lu-etal-2017-adagrad">
    <titleInfo>
      <title>Why ADAGRAD Fails for Online Topic Modeling</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">You</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jeffrey</namePart>
      <namePart type="family">Lund</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jordan</namePart>
      <namePart type="family">Boyd-Graber</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2017-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Martha</namePart>
        <namePart type="family">Palmer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rebecca</namePart>
        <namePart type="family">Hwa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sebastian</namePart>
        <namePart type="family">Riedel</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Copenhagen, Denmark</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Online topic modeling, i.e., topic modeling with stochastic variational inference, is a powerful and efficient technique for analyzing large datasets, and ADAGRAD is a widely-used technique for tuning learning rates during online gradient optimization. However, these two techniques do not work well together. We show that this is because ADAGRAD uses accumulation of previous gradients as the learning rates’ denominators. For online topic modeling, the magnitude of gradients is very large. It causes learning rates to shrink very quickly, so the parameters cannot fully converge until the training ends.</abstract>
<identifier type="citekey">lu-etal-2017-adagrad</identifier>
<identifier type="doi">10.18653/v1/D17-1046</identifier>
<location>
<url>https://aclanthology.org/D17-1046</url>
</location>
<part>
<date>2017-09</date>
<extent unit="page">
<start>446</start>
<end>451</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Why ADAGRAD Fails for Online Topic Modeling
%A Lu, You
%A Lund, Jeffrey
%A Boyd-Graber, Jordan
%Y Palmer, Martha
%Y Hwa, Rebecca
%Y Riedel, Sebastian
%S Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing
%D 2017
%8 September
%I Association for Computational Linguistics
%C Copenhagen, Denmark
%F lu-etal-2017-adagrad
%X Online topic modeling, i.e., topic modeling with stochastic variational inference, is a powerful and efficient technique for analyzing large datasets, and ADAGRAD is a widely-used technique for tuning learning rates during online gradient optimization. However, these two techniques do not work well together. We show that this is because ADAGRAD uses accumulation of previous gradients as the learning rates’ denominators. For online topic modeling, the magnitude of gradients is very large. It causes learning rates to shrink very quickly, so the parameters cannot fully converge until the training ends.
%R 10.18653/v1/D17-1046
%U https://aclanthology.org/D17-1046
%U https://doi.org/10.18653/v1/D17-1046
%P 446-451
Markdown (Informal)
[Why ADAGRAD Fails for Online Topic Modeling](https://aclanthology.org/D17-1046) (Lu et al., EMNLP 2017)
ACL
You Lu, Jeffrey Lund, and Jordan Boyd-Graber. 2017. Why ADAGRAD Fails for Online Topic Modeling. In Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing, pages 446–451, Copenhagen, Denmark. Association for Computational Linguistics.
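The failure mode described in the abstract is easy to see numerically. The sketch below is not code from the paper; the function name `adagrad_rates` and the example gradient magnitudes are invented for illustration. It only applies the standard ADAGRAD rule of dividing a base learning rate by the square root of the accumulated squared gradients, and shows how quickly the effective rate collapses when gradient magnitudes are large, as the abstract says they are in online topic modeling.

```python
# Minimal sketch (not from the paper): per-coordinate ADAGRAD learning rates
# eta_t = eta_0 / sqrt(sum of squared past gradients), illustrating the
# abstract's point that large gradient magnitudes make the rates collapse fast.
import numpy as np

def adagrad_rates(grad_magnitude, eta0=1.0, eps=1e-8, steps=100):
    """Return the effective ADAGRAD learning rate after each step,
    assuming every gradient has the same magnitude `grad_magnitude`."""
    accum = 0.0
    rates = []
    for _ in range(steps):
        g = grad_magnitude
        accum += g ** 2                               # ADAGRAD accumulates squared gradients
        rates.append(eta0 / (np.sqrt(accum) + eps))   # denominator grows monotonically
    return rates

# Small gradients versus the very large gradients the abstract
# attributes to online topic modeling.
small = adagrad_rates(grad_magnitude=1.0)
large = adagrad_rates(grad_magnitude=1e3)
print(f"step 10 rate, |g|=1:   {small[9]:.4f}")    # ~0.316
print(f"step 10 rate, |g|=1e3: {large[9]:.6f}")    # ~0.000316, already tiny
```

With unit gradients the effective rate after ten steps is still about 0.32, while with gradients of magnitude 10³ it has already dropped to roughly 3×10⁻⁴, consistent with the abstract's claim that the parameters cannot fully converge before training ends.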