@inproceedings{sia-duh-2021-adaptive,
title = "Adaptive Mixed Component {LDA} for Low Resource Topic Modeling",
author = "Sia, Suzanna and
Duh, Kevin",
editor = "Merlo, Paola and
Tiedemann, Jorg and
Tsarfaty, Reut",
booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
month = apr,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eacl-main.209",
doi = "10.18653/v1/2021.eacl-main.209",
pages = "2451--2469",
abstract = "Probabilistic topic models in low data resource scenarios are faced with less reliable estimates due to sparsity of discrete word co-occurrence counts, and do not have the luxury of retraining word or topic embeddings using neural methods. In this challenging resource constrained setting, we explore mixture models which interpolate between the discrete and continuous topic-word distributions that utilise pre-trained embeddings to improve topic coherence. We introduce an automatic trade-off between the discrete and continuous representations via an adaptive mixture coefficient, which places greater weight on the discrete representation when the corpus statistics are more reliable. The adaptive mixture coefficient takes into account global corpus statistics, and the uncertainty in each topic{'}s continuous distributions. Our approach outperforms the fully discrete, fully continuous, and static mixture model on topic coherence in low resource settings. We additionally demonstrate the generalisability of our method by extending it to handle multilingual document collections.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sia-duh-2021-adaptive">
    <titleInfo>
        <title>Adaptive Mixed Component LDA for Low Resource Topic Modeling</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Suzanna</namePart>
        <namePart type="family">Sia</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Kevin</namePart>
        <namePart type="family">Duh</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2021-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Paola</namePart>
            <namePart type="family">Merlo</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Jörg</namePart>
            <namePart type="family">Tiedemann</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Reut</namePart>
            <namePart type="family">Tsarfaty</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Online</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Probabilistic topic models in low data resource scenarios are faced with less reliable estimates due to sparsity of discrete word co-occurrence counts, and do not have the luxury of retraining word or topic embeddings using neural methods. In this challenging resource constrained setting, we explore mixture models which interpolate between the discrete and continuous topic-word distributions that utilise pre-trained embeddings to improve topic coherence. We introduce an automatic trade-off between the discrete and continuous representations via an adaptive mixture coefficient, which places greater weight on the discrete representation when the corpus statistics are more reliable. The adaptive mixture coefficient takes into account global corpus statistics, and the uncertainty in each topic’s continuous distributions. Our approach outperforms the fully discrete, fully continuous, and static mixture model on topic coherence in low resource settings. We additionally demonstrate the generalisability of our method by extending it to handle multilingual document collections.</abstract>
    <identifier type="citekey">sia-duh-2021-adaptive</identifier>
    <identifier type="doi">10.18653/v1/2021.eacl-main.209</identifier>
    <location>
        <url>https://aclanthology.org/2021.eacl-main.209</url>
    </location>
    <part>
        <date>2021-04</date>
        <extent unit="page">
            <start>2451</start>
            <end>2469</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Adaptive Mixed Component LDA for Low Resource Topic Modeling
%A Sia, Suzanna
%A Duh, Kevin
%Y Merlo, Paola
%Y Tiedemann, Jörg
%Y Tsarfaty, Reut
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F sia-duh-2021-adaptive
%X Probabilistic topic models in low data resource scenarios are faced with less reliable estimates due to sparsity of discrete word co-occurrence counts, and do not have the luxury of retraining word or topic embeddings using neural methods. In this challenging resource constrained setting, we explore mixture models which interpolate between the discrete and continuous topic-word distributions that utilise pre-trained embeddings to improve topic coherence. We introduce an automatic trade-off between the discrete and continuous representations via an adaptive mixture coefficient, which places greater weight on the discrete representation when the corpus statistics are more reliable. The adaptive mixture coefficient takes into account global corpus statistics, and the uncertainty in each topic’s continuous distributions. Our approach outperforms the fully discrete, fully continuous, and static mixture model on topic coherence in low resource settings. We additionally demonstrate the generalisability of our method by extending it to handle multilingual document collections.
%R 10.18653/v1/2021.eacl-main.209
%U https://aclanthology.org/2021.eacl-main.209
%U https://doi.org/10.18653/v1/2021.eacl-main.209
%P 2451-2469
Markdown (Informal)
[Adaptive Mixed Component LDA for Low Resource Topic Modeling](https://aclanthology.org/2021.eacl-main.209) (Sia & Duh, EACL 2021)
ACL
Suzanna Sia and Kevin Duh. 2021. Adaptive Mixed Component LDA for Low Resource Topic Modeling. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pages 2451–2469, Online. Association for Computational Linguistics.
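
The abstract describes interpolating a count-based (discrete) topic-word distribution with a continuous distribution over pre-trained embeddings, using an adaptive coefficient that favours the discrete part when corpus statistics are reliable and when the continuous estimate is uncertain. Below is a minimal Python sketch of that interpolation idea only; the function name, the smoothing constant, the Gaussian form of the continuous component, and the specific formula for the coefficient `lam` are illustrative assumptions, not the paper's actual equations.

```python
import numpy as np
from scipy.stats import multivariate_normal

def mixed_topic_word_dist(topic_word_counts, embeddings, topic_mean, topic_cov,
                          n_tokens, scale=1000.0, beta=0.01):
    """Adaptive mixture of a discrete and a continuous topic-word distribution.

    topic_word_counts : (V,) counts of each vocabulary word assigned to the topic
    embeddings        : (V, D) pre-trained word embeddings (held fixed)
    topic_mean        : (D,) mean of the topic's Gaussian in embedding space
    topic_cov         : (D, D) covariance of the topic's Gaussian
    n_tokens          : total token count of the corpus (global reliability signal)
    """
    V = len(topic_word_counts)

    # Discrete component: smoothed relative frequencies from co-occurrence counts.
    p_discrete = (topic_word_counts + beta) / (topic_word_counts.sum() + beta * V)

    # Continuous component: Gaussian density at each word's embedding,
    # renormalised over the vocabulary so it forms a proper distribution.
    dens = multivariate_normal.pdf(embeddings, mean=topic_mean, cov=topic_cov)
    p_continuous = dens / dens.sum()

    # Adaptive coefficient: weight the discrete part more when (a) the corpus
    # is larger and (b) the topic's Gaussian is more diffuse, i.e. the
    # continuous estimate is more uncertain. This functional form is an
    # illustrative assumption chosen to match the abstract's description.
    corpus_reliability = n_tokens / scale
    continuous_certainty = 1.0 / (np.trace(topic_cov) + 1e-8)
    lam = corpus_reliability / (corpus_reliability + continuous_certainty)

    return lam * p_discrete + (1.0 - lam) * p_continuous

# Toy demonstration with random data (shapes only; no claim about real corpora).
rng = np.random.default_rng(0)
V, D = 50, 8
counts = rng.integers(0, 5, size=V).astype(float)
emb = rng.normal(size=(V, D))
p = mixed_topic_word_dist(counts, emb, emb.mean(axis=0), np.cov(emb.T), n_tokens=200)
assert np.isclose(p.sum(), 1.0)
```

Note that `lam` here rises toward 1 as `n_tokens` grows and as the topic covariance trace grows, so the sketch reproduces the qualitative behaviour the abstract states: trust the discrete counts when they are plentiful, and fall back on the embedding-based distribution otherwise.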