@inproceedings{islam-foulds-2019-scalable,
title = "Scalable Collapsed Inference for High-Dimensional Topic Models",
author = "Islam, Rashidul and
Foulds, James",
editor = "Burstein, Jill and
Doran, Christy and
Solorio, Thamar",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N19-1291",
doi = "10.18653/v1/N19-1291",
pages = "2836--2845",
abstract = "The bigger the corpus, the more topics it can potentially support. To truly make full use of massive text corpora, a topic model inference algorithm must therefore scale efficiently in 1) documents and 2) topics, while 3) achieving accurate inference. Previous methods have achieved two out of three of these criteria simultaneously, but never all three at once. In this paper, we develop an online inference algorithm for topic models which leverages stochasticity to scale well in the number of documents, sparsity to scale well in the number of topics, and which operates in the collapsed representation of the topic model for improved accuracy and run-time performance. We use a Monte Carlo inner loop in the online setting to approximate the collapsed variational Bayes updates in a sparse and efficient way, which we accomplish via the MetropolisHastings Walker method. We showcase our algorithm on LDA and the recently proposed mixed membership skip-gram topic model. Our method requires only amortized $O(k_{d})$ computation per word token instead of $O(K)$ operations, where the number of topics occurring for a particular document $k_{d}\ll$ the total number of topics in the corpus $K$, to converge to a high-quality solution.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="islam-foulds-2019-scalable">
<titleInfo>
<title>Scalable Collapsed Inference for High-Dimensional Topic Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rashidul</namePart>
<namePart type="family">Islam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Foulds</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jill</namePart>
<namePart type="family">Burstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christy</namePart>
<namePart type="family">Doran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thamar</namePart>
<namePart type="family">Solorio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Minneapolis, Minnesota</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The bigger the corpus, the more topics it can potentially support. To truly make full use of massive text corpora, a topic model inference algorithm must therefore scale efficiently in 1) documents and 2) topics, while 3) achieving accurate inference. Previous methods have achieved two out of three of these criteria simultaneously, but never all three at once. In this paper, we develop an online inference algorithm for topic models which leverages stochasticity to scale well in the number of documents, sparsity to scale well in the number of topics, and which operates in the collapsed representation of the topic model for improved accuracy and run-time performance. We use a Monte Carlo inner loop in the online setting to approximate the collapsed variational Bayes updates in a sparse and efficient way, which we accomplish via the Metropolis-Hastings-Walker method. We showcase our algorithm on LDA and the recently proposed mixed membership skip-gram topic model. Our method requires only amortized O(k_d) computation per word token instead of O(K) operations, where the number of topics occurring for a particular document k_d ≪ the total number of topics in the corpus K, to converge to a high-quality solution.</abstract>
<identifier type="citekey">islam-foulds-2019-scalable</identifier>
<identifier type="doi">10.18653/v1/N19-1291</identifier>
<location>
<url>https://aclanthology.org/N19-1291</url>
</location>
<part>
<date>2019-06</date>
<extent unit="page">
<start>2836</start>
<end>2845</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Scalable Collapsed Inference for High-Dimensional Topic Models
%A Islam, Rashidul
%A Foulds, James
%Y Burstein, Jill
%Y Doran, Christy
%Y Solorio, Thamar
%S Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)
%D 2019
%8 June
%I Association for Computational Linguistics
%C Minneapolis, Minnesota
%F islam-foulds-2019-scalable
%X The bigger the corpus, the more topics it can potentially support. To truly make full use of massive text corpora, a topic model inference algorithm must therefore scale efficiently in 1) documents and 2) topics, while 3) achieving accurate inference. Previous methods have achieved two out of three of these criteria simultaneously, but never all three at once. In this paper, we develop an online inference algorithm for topic models which leverages stochasticity to scale well in the number of documents, sparsity to scale well in the number of topics, and which operates in the collapsed representation of the topic model for improved accuracy and run-time performance. We use a Monte Carlo inner loop in the online setting to approximate the collapsed variational Bayes updates in a sparse and efficient way, which we accomplish via the Metropolis-Hastings-Walker method. We showcase our algorithm on LDA and the recently proposed mixed membership skip-gram topic model. Our method requires only amortized O(k_d) computation per word token instead of O(K) operations, where the number of topics occurring for a particular document k_d ≪ the total number of topics in the corpus K, to converge to a high-quality solution.
%R 10.18653/v1/N19-1291
%U https://aclanthology.org/N19-1291
%U https://doi.org/10.18653/v1/N19-1291
%P 2836-2845
[Scalable Collapsed Inference for High-Dimensional Topic Models](https://aclanthology.org/N19-1291) (Islam & Foulds, NAACL 2019)
Rashidul Islam and James Foulds. 2019. Scalable Collapsed Inference for High-Dimensional Topic Models. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pages 2836–2845, Minneapolis, Minnesota. Association for Computational Linguistics.
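
The abstract credits its sparse, amortized O(k_d) sampling to the Metropolis-Hastings-Walker method, whose building block is Walker's alias method for O(1) draws from a fixed discrete distribution. The sketch below is an illustrative, self-contained implementation of that alias-sampling building block only, not code from the paper: the function names `build_alias_table` and `alias_draw` are invented here, and the Metropolis-Hastings correction step that the paper layers on top (to account for the table going stale as counts change) is omitted.

```python
import random

def build_alias_table(probs):
    """Construct Walker's alias table for a discrete distribution.

    After O(n) setup, each sample costs O(1): pick a uniform column,
    then flip a biased coin between that column and its alias.
    """
    n = len(probs)
    prob = [0.0] * n   # per-column acceptance probability
    alias = [0] * n    # fallback outcome for each column
    scaled = [p * n for p in probs]
    small = [i for i, p in enumerate(scaled) if p < 1.0]
    large = [i for i, p in enumerate(scaled) if p >= 1.0]
    while small and large:
        s, l = small.pop(), large.pop()
        prob[s] = scaled[s]
        alias[s] = l
        # The large entry donates mass to top up the small column.
        scaled[l] -= 1.0 - scaled[s]
        (small if scaled[l] < 1.0 else large).append(l)
    for i in small + large:
        prob[i] = 1.0  # leftovers are exactly full columns
    return prob, alias

def alias_draw(prob, alias):
    """Draw one sample from the tabled distribution in O(1)."""
    i = random.randrange(len(prob))
    return i if random.random() < prob[i] else alias[i]

# Hypothetical usage: sample topic indices from a small distribution.
table = build_alias_table([0.5, 0.2, 0.2, 0.1])
samples = [alias_draw(*table) for _ in range(10)]
print(samples)
```

In the sampler the abstract describes, proposals drawn this way from a cached table are accepted or rejected with a Metropolis-Hastings step, so the table need not be rebuilt after every count update.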