% EMNLP 2017 proceedings paper; ACL Anthology ID D17-1046 (see url field).
% The title acronym is brace-protected so sentence-casing styles keep it upper-case.
@InProceedings{lu-lund-boydgraber:2017:EMNLP2017,
  author    = {Lu, You and Lund, Jeffrey and Boyd-Graber, Jordan},
  title     = {Why {ADAGRAD} Fails for Online Topic Modeling},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {446--451},
  abstract  = {Online topic modeling, i.e., topic modeling
    with stochastic variational inference, is a
    powerful and efficient technique for analyzing
    large datasets, and ADAGRAD is a
    widely-used technique for tuning learning
    rates during online gradient optimization.
    However, these two techniques do not work
    well together. We show that this is because
    ADAGRAD uses accumulation of previous
    gradients as the learning rates' denominators.
    For online topic modeling, the magnitude
    of gradients is very large. It causes
    learning rates to shrink very quickly, so the
    parameters cannot fully converge until the
    training ends.},
  url       = {https://www.aclweb.org/anthology/D17-1046}
}

