@inproceedings{viegas-etal-2020-cluhtm,
title = "{C}lu{HTM} - Semantic Hierarchical Topic Modeling based on {C}lu{W}ords",
author = "Viegas, Felipe and
Cunha, Washington and
Gomes, Christian and
Pereira, Ant{\^o}nio and
Rocha, Leonardo and
Goncalves, Marcos",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.724",
doi = "10.18653/v1/2020.acl-main.724",
pages = "8138--8150",
abstract = "Hierarchical Topic modeling (HTM) exploits latent topics and relationships among them as a powerful tool for data analysis and exploration. Despite advantages over traditional topic modeling, HTM poses its own challenges, such as (1) topic incoherence, (2) unreasonable (hierarchical) structure, and (3) issues related to the definition of the {``}ideal{''} number of topics and depth of the hierarchy. In this paper, we advance the state-of-the-art on HTM by means of the design and evaluation of CluHTM, a novel non-probabilistic hierarchical matrix factorization aimed at solving the specific issues of HTM. CluHTM{'}s novel contributions include: (i) the exploration of richer text representation that encapsulates both, global (dataset level) and local semantic information {--} when combined, these pieces of information help to solve the topic incoherence problem as well as issues related to the unreasonable structure; (ii) the exploitation of a stability analysis metric for defining the number of topics and the {``}shape{''} the hierarchical structure. In our evaluation, considering twelve datasets and seven state-of-the-art baselines, CluHTM outperformed the baselines in the vast majority of the cases, with gains of around 500{\%} over the strongest state-of-the-art baselines. We also provide qualitative and quantitative statistical analyses of why our solution works so well.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="viegas-etal-2020-cluhtm">
<titleInfo>
<title>CluHTM - Semantic Hierarchical Topic Modeling based on CluWords</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felipe</namePart>
<namePart type="family">Viegas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Washington</namePart>
<namePart type="family">Cunha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Gomes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antônio</namePart>
<namePart type="family">Pereira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leonardo</namePart>
<namePart type="family">Rocha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Goncalves</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dan</namePart>
<namePart type="family">Jurafsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Chai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalie</namePart>
<namePart type="family">Schluter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Hierarchical Topic modeling (HTM) exploits latent topics and relationships among them as a powerful tool for data analysis and exploration. Despite advantages over traditional topic modeling, HTM poses its own challenges, such as (1) topic incoherence, (2) unreasonable (hierarchical) structure, and (3) issues related to the definition of the “ideal” number of topics and depth of the hierarchy. In this paper, we advance the state-of-the-art on HTM by means of the design and evaluation of CluHTM, a novel non-probabilistic hierarchical matrix factorization aimed at solving the specific issues of HTM. CluHTM’s novel contributions include: (i) the exploration of richer text representation that encapsulates both, global (dataset level) and local semantic information – when combined, these pieces of information help to solve the topic incoherence problem as well as issues related to the unreasonable structure; (ii) the exploitation of a stability analysis metric for defining the number of topics and the “shape” the hierarchical structure. In our evaluation, considering twelve datasets and seven state-of-the-art baselines, CluHTM outperformed the baselines in the vast majority of the cases, with gains of around 500% over the strongest state-of-the-art baselines. We also provide qualitative and quantitative statistical analyses of why our solution works so well.</abstract>
<identifier type="citekey">viegas-etal-2020-cluhtm</identifier>
<identifier type="doi">10.18653/v1/2020.acl-main.724</identifier>
<location>
<url>https://aclanthology.org/2020.acl-main.724</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>8138</start>
<end>8150</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CluHTM - Semantic Hierarchical Topic Modeling based on CluWords
%A Viegas, Felipe
%A Cunha, Washington
%A Gomes, Christian
%A Pereira, Antônio
%A Rocha, Leonardo
%A Goncalves, Marcos
%Y Jurafsky, Dan
%Y Chai, Joyce
%Y Schluter, Natalie
%Y Tetreault, Joel
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F viegas-etal-2020-cluhtm
%X Hierarchical Topic modeling (HTM) exploits latent topics and relationships among them as a powerful tool for data analysis and exploration. Despite advantages over traditional topic modeling, HTM poses its own challenges, such as (1) topic incoherence, (2) unreasonable (hierarchical) structure, and (3) issues related to the definition of the “ideal” number of topics and depth of the hierarchy. In this paper, we advance the state-of-the-art on HTM by means of the design and evaluation of CluHTM, a novel non-probabilistic hierarchical matrix factorization aimed at solving the specific issues of HTM. CluHTM’s novel contributions include: (i) the exploration of richer text representation that encapsulates both, global (dataset level) and local semantic information – when combined, these pieces of information help to solve the topic incoherence problem as well as issues related to the unreasonable structure; (ii) the exploitation of a stability analysis metric for defining the number of topics and the “shape” the hierarchical structure. In our evaluation, considering twelve datasets and seven state-of-the-art baselines, CluHTM outperformed the baselines in the vast majority of the cases, with gains of around 500% over the strongest state-of-the-art baselines. We also provide qualitative and quantitative statistical analyses of why our solution works so well.
%R 10.18653/v1/2020.acl-main.724
%U https://aclanthology.org/2020.acl-main.724
%U https://doi.org/10.18653/v1/2020.acl-main.724
%P 8138-8150
Markdown (Informal)
[CluHTM - Semantic Hierarchical Topic Modeling based on CluWords](https://aclanthology.org/2020.acl-main.724) (Viegas et al., ACL 2020)
ACL
- Felipe Viegas, Washington Cunha, Christian Gomes, Antônio Pereira, Leonardo Rocha, and Marcos Goncalves. 2020. CluHTM - Semantic Hierarchical Topic Modeling based on CluWords. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 8138–8150, Online. Association for Computational Linguistics.