@inproceedings{eklund-etal-2023-empirical,
title = "An Empirical Configuration Study of a Common Document Clustering Pipeline",
author = "Eklund, Anton and
Forsman, Mona and
Drewes, Frank",
editor = "Derczynski, Leon",
booktitle = "Northern European Journal of Language Technology, Volume 9",
year = "2023",
address = {Link{\"o}ping, Sweden},
publisher = {Link{\"o}ping University Electronic Press},
url = "https://aclanthology.org/2023.nejlt-1.7",
doi = "https://doi.org/10.3384/nejlt.2000-1533.2023.4396",
    abstract = "Document clustering is frequently used in applications of natural language processing, e.g. to classify news articles or creating topic models. In this paper, we study document clustering with the common clustering pipeline that includes vectorization with BERT or Doc2Vec, dimension reduction with PCA or UMAP, and clustering with K-Means or HDBSCAN. We discuss the interactions of the different components in the pipeline, parameter settings, and how to determine an appropriate number of dimensions. The results suggest that BERT embeddings combined with UMAP dimension reduction to no less than 15 dimensions provides a good basis for clustering, regardless of the specific clustering algorithm used. Moreover, while UMAP performed better than PCA in our experiments, tuning the UMAP settings showed little impact on the overall performance. Hence, we recommend configuring UMAP so as to optimize its time efficiency. According to our topic model evaluation, the combination of BERT and UMAP, also used in BERTopic, performs best. A topic model based on this pipeline typically benefits from a large number of clusters.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="eklund-etal-2023-empirical">
<titleInfo>
<title>An Empirical Configuration Study of a Common Document Clustering Pipeline</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Eklund</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mona</namePart>
<namePart type="family">Forsman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frank</namePart>
<namePart type="family">Drewes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Northern European Journal of Language Technology, Volume 9</title>
</titleInfo>
<name type="personal">
<namePart type="given">Leon</namePart>
<namePart type="family">Derczynski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Linköping University Electronic Press</publisher>
<place>
<placeTerm type="text">Linköping, Sweden</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Document clustering is frequently used in applications of natural language processing, e.g. to classify news articles or creating topic models. In this paper, we study document clustering with the common clustering pipeline that includes vectorization with BERT or Doc2Vec, dimension reduction with PCA or UMAP, and clustering with K-Means or HDBSCAN. We discuss the interactions of the different components in the pipeline, parameter settings, and how to determine an appropriate number of dimensions. The results suggest that BERT embeddings combined with UMAP dimension reduction to no less than 15 dimensions provides a good basis for clustering, regardless of the specific clustering algorithm used. Moreover, while UMAP performed better than PCA in our experiments, tuning the UMAP settings showed little impact on the overall performance. Hence, we recommend configuring UMAP so as to optimize its time efficiency. According to our topic model evaluation, the combination of BERT and UMAP, also used in BERTopic, performs best. A topic model based on this pipeline typically benefits from a large number of clusters.</abstract>
<identifier type="citekey">eklund-etal-2023-empirical</identifier>
<identifier type="doi">https://doi.org/10.3384/nejlt.2000-1533.2023.4396</identifier>
<location>
<url>https://aclanthology.org/2023.nejlt-1.7</url>
</location>
<part>
<date>2023</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Empirical Configuration Study of a Common Document Clustering Pipeline
%A Eklund, Anton
%A Forsman, Mona
%A Drewes, Frank
%Y Derczynski, Leon
%S Northern European Journal of Language Technology, Volume 9
%D 2023
%I Linköping University Electronic Press
%C Linköping, Sweden
%F eklund-etal-2023-empirical
%X Document clustering is frequently used in applications of natural language processing, e.g. to classify news articles or creating topic models. In this paper, we study document clustering with the common clustering pipeline that includes vectorization with BERT or Doc2Vec, dimension reduction with PCA or UMAP, and clustering with K-Means or HDBSCAN. We discuss the interactions of the different components in the pipeline, parameter settings, and how to determine an appropriate number of dimensions. The results suggest that BERT embeddings combined with UMAP dimension reduction to no less than 15 dimensions provides a good basis for clustering, regardless of the specific clustering algorithm used. Moreover, while UMAP performed better than PCA in our experiments, tuning the UMAP settings showed little impact on the overall performance. Hence, we recommend configuring UMAP so as to optimize its time efficiency. According to our topic model evaluation, the combination of BERT and UMAP, also used in BERTopic, performs best. A topic model based on this pipeline typically benefits from a large number of clusters.
%R https://doi.org/10.3384/nejlt.2000-1533.2023.4396
%U https://aclanthology.org/2023.nejlt-1.7
%U https://doi.org/10.3384/nejlt.2000-1533.2023.4396
Markdown (Informal)
[An Empirical Configuration Study of a Common Document Clustering Pipeline](https://aclanthology.org/2023.nejlt-1.7) (Eklund et al., NEJLT 2023)
ACL