@inproceedings{poumay-ittoo-2023-evaluating,
title = "Evaluating Unsupervised Hierarchical Topic Models Using a Labeled Dataset",
author = "Poumay, Judicael and
Ittoo, Ashwin",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.ranlp-1.91",
pages = "846--853",
abstract = "Topic modeling is a commonly used method for identifying and extracting topics from a corpus of documents. While several evaluation techniques, such as perplexity and topic coherence, have been developed to assess the quality of extracted topics, they fail to determine whether all topics have been identified and to what extent they have been represented. Additionally, hierarchical topic models have been proposed, but the quality of the hierarchy produced has not been adequately evaluated. This study proposes a novel approach to evaluating topic models that supplements existing methods. Using a labeled dataset, we trained hierarchical topic models in an unsupervised manner and used the known labels to evaluate the accuracy of the results. Our findings indicate that labels encompassing a substantial number of documents achieve high accuracy of over 70{\%}. Although there are 90 labels in the dataset, labels that cover only 1{\%} of the data still achieve an average accuracy of 37.9{\%}, demonstrating the effectiveness of hierarchical topic models even on smaller subsets. Furthermore, we demonstrate that these labels can be used to assess the quality of the topic tree and confirm that hierarchical topic models produce coherent taxonomies for the labels.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="poumay-ittoo-2023-evaluating">
<titleInfo>
<title>Evaluating Unsupervised Hierarchical Topic Models Using a Labeled Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Judicael</namePart>
<namePart type="family">Poumay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashwin</namePart>
<namePart type="family">Ittoo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Topic modeling is a commonly used method for identifying and extracting topics from a corpus of documents. While several evaluation techniques, such as perplexity and topic coherence, have been developed to assess the quality of extracted topics, they fail to determine whether all topics have been identified and to what extent they have been represented. Additionally, hierarchical topic models have been proposed, but the quality of the hierarchy produced has not been adequately evaluated. This study proposes a novel approach to evaluating topic models that supplements existing methods. Using a labeled dataset, we trained hierarchical topic models in an unsupervised manner and used the known labels to evaluate the accuracy of the results. Our findings indicate that labels encompassing a substantial number of documents achieve high accuracy of over 70%. Although there are 90 labels in the dataset, labels that cover only 1% of the data still achieve an average accuracy of 37.9%, demonstrating the effectiveness of hierarchical topic models even on smaller subsets. Furthermore, we demonstrate that these labels can be used to assess the quality of the topic tree and confirm that hierarchical topic models produce coherent taxonomies for the labels.</abstract>
<identifier type="citekey">poumay-ittoo-2023-evaluating</identifier>
<location>
<url>https://aclanthology.org/2023.ranlp-1.91</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>846</start>
<end>853</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Unsupervised Hierarchical Topic Models Using a Labeled Dataset
%A Poumay, Judicael
%A Ittoo, Ashwin
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing
%D 2023
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F poumay-ittoo-2023-evaluating
%X Topic modeling is a commonly used method for identifying and extracting topics from a corpus of documents. While several evaluation techniques, such as perplexity and topic coherence, have been developed to assess the quality of extracted topics, they fail to determine whether all topics have been identified and to what extent they have been represented. Additionally, hierarchical topic models have been proposed, but the quality of the hierarchy produced has not been adequately evaluated. This study proposes a novel approach to evaluating topic models that supplements existing methods. Using a labeled dataset, we trained hierarchical topic models in an unsupervised manner and used the known labels to evaluate the accuracy of the results. Our findings indicate that labels encompassing a substantial number of documents achieve high accuracy of over 70%. Although there are 90 labels in the dataset, labels that cover only 1% of the data still achieve an average accuracy of 37.9%, demonstrating the effectiveness of hierarchical topic models even on smaller subsets. Furthermore, we demonstrate that these labels can be used to assess the quality of the topic tree and confirm that hierarchical topic models produce coherent taxonomies for the labels.
%U https://aclanthology.org/2023.ranlp-1.91
%P 846-853
Markdown (Informal)
[Evaluating Unsupervised Hierarchical Topic Models Using a Labeled Dataset](https://aclanthology.org/2023.ranlp-1.91) (Poumay & Ittoo, RANLP 2023)
ACL