@article{thielmann-etal-2024-topics,
title = "Topics in the Haystack: Enhancing Topic Quality through Corpus Expansion",
author = {Thielmann, Anton and
Reuter, Arik and
Seifert, Quentin and
Bergherr, Elisabeth and
S{\"a}fken, Benjamin},
journal = "Computational Linguistics",
volume = "50",
number = "2",
month = jun,
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.cl-2.5",
doi = "10.1162/coli_a_00506",
pages = "619--655",
abstract = "Extracting and identifying latent topics in large text corpora have gained increasing importance in Natural Language Processing (NLP). Most models, whether probabilistic models similar to Latent Dirichlet Allocation (LDA) or neural topic models, follow the same underlying approach of topic interpretability and topic extraction. We propose a method that incorporates a deeper understanding of both sentence and document themes, and goes beyond simply analyzing word frequencies in the data. Through simple corpus expansion, our model can detect latent topics that may include uncommon words or neologisms, as well as words not present in the documents themselves. Additionally, we propose several new evaluation metrics based on intruder words and similarity measures in the semantic space. We present correlation coefficients with human identification of intruder words and achieve near-human level results at the word-intrusion task. We demonstrate the competitive performance of our method with a large benchmark study, and achieve superior results compared with state-of-the-art topic modeling and document clustering models. The code is available at the following link: https://github.com/AnFreTh/STREAM.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thielmann-etal-2024-topics">
<titleInfo>
<title>Topics in the Haystack: Enhancing Topic Quality through Corpus Expansion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Thielmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arik</namePart>
<namePart type="family">Reuter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quentin</namePart>
<namePart type="family">Seifert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elisabeth</namePart>
<namePart type="family">Bergherr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Säfken</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Extracting and identifying latent topics in large text corpora have gained increasing importance in Natural Language Processing (NLP). Most models, whether probabilistic models similar to Latent Dirichlet Allocation (LDA) or neural topic models, follow the same underlying approach of topic interpretability and topic extraction. We propose a method that incorporates a deeper understanding of both sentence and document themes, and goes beyond simply analyzing word frequencies in the data. Through simple corpus expansion, our model can detect latent topics that may include uncommon words or neologisms, as well as words not present in the documents themselves. Additionally, we propose several new evaluation metrics based on intruder words and similarity measures in the semantic space. We present correlation coefficients with human identification of intruder words and achieve near-human level results at the word-intrusion task. We demonstrate the competitive performance of our method with a large benchmark study, and achieve superior results compared with state-of-the-art topic modeling and document clustering models. The code is available at the following link: https://github.com/AnFreTh/STREAM.</abstract>
<identifier type="citekey">thielmann-etal-2024-topics</identifier>
<identifier type="doi">10.1162/coli_a_00506</identifier>
<location>
<url>https://aclanthology.org/2024.cl-2.5</url>
</location>
<part>
<date>2024-06</date>
<detail type="volume"><number>50</number></detail>
<detail type="issue"><number>2</number></detail>
<extent unit="page">
<start>619</start>
<end>655</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Topics in the Haystack: Enhancing Topic Quality through Corpus Expansion
%A Thielmann, Anton
%A Reuter, Arik
%A Seifert, Quentin
%A Bergherr, Elisabeth
%A Säfken, Benjamin
%J Computational Linguistics
%D 2024
%8 June
%V 50
%N 2
%I MIT Press
%C Cambridge, MA
%F thielmann-etal-2024-topics
%X Extracting and identifying latent topics in large text corpora have gained increasing importance in Natural Language Processing (NLP). Most models, whether probabilistic models similar to Latent Dirichlet Allocation (LDA) or neural topic models, follow the same underlying approach of topic interpretability and topic extraction. We propose a method that incorporates a deeper understanding of both sentence and document themes, and goes beyond simply analyzing word frequencies in the data. Through simple corpus expansion, our model can detect latent topics that may include uncommon words or neologisms, as well as words not present in the documents themselves. Additionally, we propose several new evaluation metrics based on intruder words and similarity measures in the semantic space. We present correlation coefficients with human identification of intruder words and achieve near-human level results at the word-intrusion task. We demonstrate the competitive performance of our method with a large benchmark study, and achieve superior results compared with state-of-the-art topic modeling and document clustering models. The code is available at the following link: https://github.com/AnFreTh/STREAM.
%R 10.1162/coli_a_00506
%U https://aclanthology.org/2024.cl-2.5
%U https://doi.org/10.1162/coli_a_00506
%P 619-655
Markdown (Informal)
[Topics in the Haystack: Enhancing Topic Quality through Corpus Expansion](https://aclanthology.org/2024.cl-2.5) (Thielmann et al., CL 2024)
ACL