@inproceedings{ma-etal-2025-cast,
title = "{CAST}: Corpus-Aware Self-similarity Enhanced Topic modelling",
author = "Ma, Yanan and
Xiao, Chenghao and
Yuan, Chenhan and
Veer, Sabine N Van Der and
Hassan, Lamiece and
Lin, Chenghua and
Nenadic, Goran",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.386/",
doi = "10.18653/v1/2025.naacl-long.386",
pages = "7548--7561",
ISBN = "979-8-89176-189-6",
abstract = "Topic modelling is a pivotal unsupervised machine learning technique for extracting valuable insights from large document collections. Existing neural topic modelling methods often encode contextual information of documents, while ignoring contextual details of candidate centroid words, leading to the inaccurate selection of topic words due to the *contextualization gap*. In parallel, it is found that functional words are frequently selected over topical words. To address these limitations, we introduce **CAST**: **C**orpus-**A**ware **S**elf-similarity Enhanced **T**opic modelling, a novel topic modelling method that builds upon candidate centroid word embeddings contextualized on the dataset, and a novel self-similarity-based method to filter out less meaningful tokens. Inspired by findings in contrastive learning that self-similarities of functional token embeddings in different contexts are much lower than topical tokens, we find self-similarity to be an effective metric to prevent functional words from acting as candidate topic words. Our approach significantly enhances the coherence and diversity of generated topics, as well as the topic model{'}s ability to handle noisy data. Experiments on news benchmark datasets and one Twitter dataset demonstrate the method{'}s superiority in generating coherent, diverse topics, and handling noisy data, outperforming strong baselines."
}
Markdown (Informal):
[CAST: Corpus-Aware Self-similarity Enhanced Topic modelling](https://aclanthology.org/2025.naacl-long.386/) (Ma et al., NAACL 2025)

ACL:
Yanan Ma, Chenghao Xiao, Chenhan Yuan, Sabine N Van Der Veer, Lamiece Hassan, Chenghua Lin, and Goran Nenadic. 2025. CAST: Corpus-Aware Self-similarity Enhanced Topic modelling. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 7548–7561, Albuquerque, New Mexico. Association for Computational Linguistics.
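The filtering idea in the abstract — score each token by how similar its contextual embeddings are to one another across the corpus, and discard low-scoring (functional) tokens — is concrete enough to sketch. The following is a minimal, hypothetical illustration of that metric only, not the paper's implementation: the encoder checkpoint, the 0.5 threshold, and the toy corpus are all placeholder assumptions, and the rest of the CAST pipeline (corpus-contextualized centroid embeddings, clustering, topic-word selection) is not reproduced here.

```python
# Hypothetical sketch of the self-similarity filter described in the abstract.
# Self-similarity of a token = mean pairwise cosine similarity of its
# contextual embeddings across the documents it appears in. Per the abstract,
# functional words ("the", "would", ...) score lower than topical words.
from collections import defaultdict

import torch
from transformers import AutoModel, AutoTokenizer

MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # placeholder; any contextual encoder
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL).eval()


def self_similarity_scores(docs):
    """Mean pairwise cosine similarity of each token's embeddings across contexts."""
    contexts = defaultdict(list)
    with torch.no_grad():
        for doc in docs:
            enc = tokenizer(doc, return_tensors="pt", truncation=True)
            hidden = model(**enc).last_hidden_state[0]            # (seq_len, dim)
            for tok_id, vec in zip(enc["input_ids"][0].tolist(), hidden):
                if tok_id in tokenizer.all_special_ids:           # skip [CLS], [SEP], ...
                    continue
                token = tokenizer.decode([tok_id]).strip().lower()
                contexts[token].append(vec)
    scores = {}
    for token, vecs in contexts.items():
        if len(vecs) < 2:                                         # need >= 2 contexts
            continue
        mat = torch.nn.functional.normalize(torch.stack(vecs), dim=-1)
        sim = mat @ mat.T                                         # pairwise cosines
        n = sim.size(0)
        scores[token] = ((sim.sum() - sim.trace()) / (n * (n - 1))).item()
    return scores


docs = [
    "The central bank raised interest rates again this quarter.",
    "Analysts expect the bank to cut rates if inflation slows.",
    "The government said the new policy would curb inflation.",
]
scores = self_similarity_scores(docs)
# Keep only high-scoring tokens as candidate topic words; 0.5 is illustrative.
candidates = sorted((t for t, s in scores.items() if s > 0.5),
                    key=scores.get, reverse=True)
print(candidates)
```

Under this reading, a token like "the" accumulates embeddings from many unrelated contexts and averages out to a low self-similarity, while "inflation" or "rates" stays tightly clustered; thresholding the score then acts as a corpus-specific stopword filter, which is the role the abstract assigns to self-similarity.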