@article{viegas-etal-2025-exploiting,
title = "Exploiting Contextual Embeddings in Hierarchical Topic Modeling and Investigating the Limits of the Current Evaluation Metrics",
author = "Viegas, Felipe and
Pereira, Antonio and
Cunha, Washington and
Fran{\c{c}}a, Celso and
Andrade, Claudio and
Tuler, Elisa and
Rocha, Leonardo and
Gon{\c{c}}alves, Marcos Andr{\'e}",
journal = "Computational Linguistics",
volume = "51",
number = "3",
month = sep,
year = "2025",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2025.cl-3.5/",
doi = "10.1162/coli_a_00543",
pages = "843--883",
abstract = "We investigate two essential challenges in the context of hierarchical topic modeling (HTM){---}(i) the impact of data representation and (ii) topic evaluation. The data representation directly influences the performance of the topic generation, and the impact of new representations such as contextual embeddings in this task has been under-investigated. Topic evaluation, responsible for driving the advances in the field, assesses the overall quality of the topic generation process. HTM studies exploit the exact topic modeling (TM) evaluation metrics as traditional TM to measure the quality of topics. One significant result of our work is demonstrating that the HTM{'}s hierarchical nature demands novel ways of evaluating the quality of topics. As our main contribution, we propose two new topic quality metrics to assess the topical quality of the hierarchical structures. Uniqueness considers topic topological consistency, while the Semantic Hierarchical Structure (SHS) captures the semantic relatedness of the hierarchies. We also present an additional advance to the state-of-the-art by proposing the c-CluHTM. To the best of our knowledge, c-CluHTM is the first method that exploits contextual embeddings into NMF in HTM tasks. c-CluHTM enhances the topics' semantics while preserving the hierarchical structure. We perform an experimental evaluation, and our results demonstrate the superiority of our proposal with gains between 12{\%} and 21{\%}, regarding NPMI and Coherence over the best baselines. Regarding the newly proposed metrics, our results reveal that Uniqueness and SHS can capture relevant information about the structure of the hierarchical topics that traditional metrics cannot."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="viegas-etal-2025-exploiting">
<titleInfo>
<title>Exploiting Contextual Embeddings in Hierarchical Topic Modeling and Investigating the Limits of the Current Evaluation Metrics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felipe</namePart>
<namePart type="family">Viegas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Pereira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Washington</namePart>
<namePart type="family">Cunha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Celso</namePart>
<namePart type="family">França</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudio</namePart>
<namePart type="family">Andrade</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elisa</namePart>
<namePart type="family">Tuler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leonardo</namePart>
<namePart type="family">Rocha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="given">André</namePart>
<namePart type="family">Gonçalves</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>We investigate two essential challenges in the context of hierarchical topic modeling (HTM)—(i) the impact of data representation and (ii) topic evaluation. The data representation directly influences the performance of the topic generation, and the impact of new representations such as contextual embeddings in this task has been under-investigated. Topic evaluation, responsible for driving the advances in the field, assesses the overall quality of the topic generation process. HTM studies exploit the exact topic modeling (TM) evaluation metrics as traditional TM to measure the quality of topics. One significant result of our work is demonstrating that the HTM’s hierarchical nature demands novel ways of evaluating the quality of topics. As our main contribution, we propose two new topic quality metrics to assess the topical quality of the hierarchical structures. Uniqueness considers topic topological consistency, while the Semantic Hierarchical Structure (SHS) captures the semantic relatedness of the hierarchies. We also present an additional advance to the state-of-the-art by proposing the c-CluHTM. To the best of our knowledge, c-CluHTM is the first method that exploits contextual embeddings into NMF in HTM tasks. c-CluHTM enhances the topics’ semantics while preserving the hierarchical structure. We perform an experimental evaluation, and our results demonstrate the superiority of our proposal with gains between 12% and 21%, regarding NPMI and Coherence over the best baselines. Regarding the newly proposed metrics, our results reveal that Uniqueness and SHS can capture relevant information about the structure of the hierarchical topics that traditional metrics cannot.</abstract>
<identifier type="citekey">viegas-etal-2025-exploiting</identifier>
<identifier type="doi">10.1162/coli_a_00543</identifier>
<location>
<url>https://aclanthology.org/2025.cl-3.5/</url>
</location>
<part>
<date>2025-09</date>
<detail type="volume"><number>51</number></detail>
<detail type="issue"><number>3</number></detail>
<extent unit="page">
<start>843</start>
<end>883</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Exploiting Contextual Embeddings in Hierarchical Topic Modeling and Investigating the Limits of the Current Evaluation Metrics
%A Viegas, Felipe
%A Pereira, Antonio
%A Cunha, Washington
%A França, Celso
%A Andrade, Claudio
%A Tuler, Elisa
%A Rocha, Leonardo
%A Gonçalves, Marcos André
%J Computational Linguistics
%D 2025
%8 September
%V 51
%N 3
%I MIT Press
%C Cambridge, MA
%F viegas-etal-2025-exploiting
%X We investigate two essential challenges in the context of hierarchical topic modeling (HTM)—(i) the impact of data representation and (ii) topic evaluation. The data representation directly influences the performance of the topic generation, and the impact of new representations such as contextual embeddings in this task has been under-investigated. Topic evaluation, responsible for driving the advances in the field, assesses the overall quality of the topic generation process. HTM studies exploit the exact topic modeling (TM) evaluation metrics as traditional TM to measure the quality of topics. One significant result of our work is demonstrating that the HTM’s hierarchical nature demands novel ways of evaluating the quality of topics. As our main contribution, we propose two new topic quality metrics to assess the topical quality of the hierarchical structures. Uniqueness considers topic topological consistency, while the Semantic Hierarchical Structure (SHS) captures the semantic relatedness of the hierarchies. We also present an additional advance to the state-of-the-art by proposing the c-CluHTM. To the best of our knowledge, c-CluHTM is the first method that exploits contextual embeddings into NMF in HTM tasks. c-CluHTM enhances the topics’ semantics while preserving the hierarchical structure. We perform an experimental evaluation, and our results demonstrate the superiority of our proposal with gains between 12% and 21%, regarding NPMI and Coherence over the best baselines. Regarding the newly proposed metrics, our results reveal that Uniqueness and SHS can capture relevant information about the structure of the hierarchical topics that traditional metrics cannot.
%R 10.1162/coli_a_00543
%U https://aclanthology.org/2025.cl-3.5/
%U https://doi.org/10.1162/coli_a_00543
%P 843-883
Markdown (Informal)
[Exploiting Contextual Embeddings in Hierarchical Topic Modeling and Investigating the Limits of the Current Evaluation Metrics](https://aclanthology.org/2025.cl-3.5/) (Viegas et al., CL 2025)
ACL