BibTeX

@inproceedings{silburt-etal-2021-fanatic-fast,
    title = "{FANATIC}: {FA}st {N}oise-{A}ware {T}op{I}c {C}lustering",
    author = "Silburt, Ari and
      Subasic, Anja and
      Thompson, Evan and
      Dsilva, Carmeline and
      Fares, Tarec",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
    month = nov,
    year = "2021",
    address = "Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.findings-emnlp.57",
    doi = "10.18653/v1/2021.findings-emnlp.57",
    pages = "650--663",
    abstract = "Extracting salient topics from a collection of documents can be a challenging task when a) the amount of data is large, b) the number of topics is not known a priori, and/or c) {``}topic noise{''} is present. We define {``}topic noise{''} as the collection of documents that are irrelevant to any coherent topic and should be filtered out. By design, most clustering algorithms (e.g. k-means, hierarchical clustering) assign all input documents to one of the available clusters, guaranteeing any topic noise to propagate into the result. To address these challenges, we present a novel algorithm, FANATIC, that efficiently distinguishes documents from genuine topics and those that are topic noise. We also introduce a new Reddit dataset to showcase FANATIC as it contains short, noisy data that is difficult to cluster using most clustering algorithms. We find that FANATIC clusters 500k Reddit titles (of which 20{\%} are topic noise) in 2 minutes and achieves an AMI score of 0.59, in contrast with hdbscan (McInnes et al., 2017), a popular algorithm suited for this type of task, which requires over 7 hours and achieves an AMI of 0.03. Finally, we test FANATIC against a Twitter dataset and find again that it outperforms the other algorithms with an AMI score of 0.60. We make our code and data publicly available.",
}

MODS XML

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="silburt-etal-2021-fanatic-fast">
    <titleInfo>
      <title>FANATIC: FAst Noise-Aware TopIc Clustering</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ari</namePart>
      <namePart type="family">Silburt</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anja</namePart>
      <namePart type="family">Subasic</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Evan</namePart>
      <namePart type="family">Thompson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Carmeline</namePart>
      <namePart type="family">Dsilva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tarec</namePart>
      <namePart type="family">Fares</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2021</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Extracting salient topics from a collection of documents can be a challenging task when a) the amount of data is large, b) the number of topics is not known a priori, and/or c) “topic noise” is present. We define “topic noise” as the collection of documents that are irrelevant to any coherent topic and should be filtered out. By design, most clustering algorithms (e.g. k-means, hierarchical clustering) assign all input documents to one of the available clusters, guaranteeing any topic noise to propagate into the result. To address these challenges, we present a novel algorithm, FANATIC, that efficiently distinguishes documents from genuine topics and those that are topic noise. We also introduce a new Reddit dataset to showcase FANATIC as it contains short, noisy data that is difficult to cluster using most clustering algorithms. We find that FANATIC clusters 500k Reddit titles (of which 20% are topic noise) in 2 minutes and achieves an AMI score of 0.59, in contrast with hdbscan (McInnes et al., 2017), a popular algorithm suited for this type of task, which requires over 7 hours and achieves an AMI of 0.03. Finally, we test FANATIC against a Twitter dataset and find again that it outperforms the other algorithms with an AMI score of 0.60. We make our code and data publicly available.</abstract>
    <identifier type="citekey">silburt-etal-2021-fanatic-fast</identifier>
    <identifier type="doi">10.18653/v1/2021.findings-emnlp.57</identifier>
    <location>
      <url>https://aclanthology.org/2021.findings-emnlp.57</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>650</start>
        <end>663</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote

%0 Conference Proceedings
%T FANATIC: FAst Noise-Aware TopIc Clustering
%A Silburt, Ari
%A Subasic, Anja
%A Thompson, Evan
%A Dsilva, Carmeline
%A Fares, Tarec
%S Findings of the Association for Computational Linguistics: EMNLP 2021
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F silburt-etal-2021-fanatic-fast
%X Extracting salient topics from a collection of documents can be a challenging task when a) the amount of data is large, b) the number of topics is not known a priori, and/or c) “topic noise” is present. We define “topic noise” as the collection of documents that are irrelevant to any coherent topic and should be filtered out. By design, most clustering algorithms (e.g. k-means, hierarchical clustering) assign all input documents to one of the available clusters, guaranteeing any topic noise to propagate into the result. To address these challenges, we present a novel algorithm, FANATIC, that efficiently distinguishes documents from genuine topics and those that are topic noise. We also introduce a new Reddit dataset to showcase FANATIC as it contains short, noisy data that is difficult to cluster using most clustering algorithms. We find that FANATIC clusters 500k Reddit titles (of which 20% are topic noise) in 2 minutes and achieves an AMI score of 0.59, in contrast with hdbscan (McInnes et al., 2017), a popular algorithm suited for this type of task, which requires over 7 hours and achieves an AMI of 0.03. Finally, we test FANATIC against a Twitter dataset and find again that it outperforms the other algorithms with an AMI score of 0.60. We make our code and data publicly available.
%R 10.18653/v1/2021.findings-emnlp.57
%U https://aclanthology.org/2021.findings-emnlp.57
%U https://doi.org/10.18653/v1/2021.findings-emnlp.57
%P 650-663

Markdown (Informal)

[FANATIC: FAst Noise-Aware TopIc Clustering](https://aclanthology.org/2021.findings-emnlp.57) (Silburt et al., Findings 2021)

ACL

Ari Silburt, Anja Subasic, Evan Thompson, Carmeline Dsilva, and Tarec Fares. 2021. FANATIC: FAst Noise-Aware TopIc Clustering. In Findings of the Association for Computational Linguistics: EMNLP 2021, pages 650–663, Punta Cana, Dominican Republic. Association for Computational Linguistics.
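
A note on the evaluation described in the abstract: the headline numbers are Adjusted Mutual Information (AMI) scores, with hdbscan (McInnes et al., 2017) as the baseline clusterer that also leaves noise points unassigned. The snippet below is a minimal, hypothetical sketch of how such an AMI comparison can be wired up with off-the-shelf tools; it is not the paper's code, and the TF-IDF representation, hdbscan parameters, and toy data are illustrative assumptions only.

```python
# Minimal sketch (not the FANATIC implementation): cluster short, noisy titles,
# let the clusterer leave "topic noise" unassigned, and score the assignment
# against gold topic labels with Adjusted Mutual Information (AMI).
import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_mutual_info_score

# Toy stand-in data; the real experiments use 500k Reddit titles.
titles = [
    "lakers clinch the nba finals in game six",
    "full nba playoff schedule for tonight",
    "easy weeknight pasta recipe with garlic",
    "how long should pasta cook to be al dente",
    "asdkjf qwerty random keyboard mashing",   # stands in for a topic-noise document
]
gold = [0, 0, 1, 1, -1]  # -1 marks topic noise in the gold labels

# Any document representation would do for this sketch; TF-IDF keeps it light.
X = TfidfVectorizer().fit_transform(titles).toarray()

# hdbscan labels unclustered points -1, which lines up with the abstract's
# notion of filtering out topic noise rather than forcing every document
# into a cluster.
pred = hdbscan.HDBSCAN(min_cluster_size=2).fit_predict(X)

# AMI is the metric reported in the abstract (0.59 for FANATIC vs. 0.03 for
# hdbscan on the Reddit dataset); this toy run only illustrates the scoring call.
print(adjusted_mutual_info_score(gold, pred))
```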