% Conference paper from the Proceedings of the 11th Global Wordnet Conference (ACL Anthology 2021.gwc-1.24).
% Case-sensitive title words are brace-protected as whole words so sentence-casing styles keep their capitals.
% Accented letters use the brace-wrapped escape form so classic BibTeX treats each one as a single letter.
@inproceedings{marcinczuk-etal-2021-text,
    title     = {Text Document Clustering: {Wordnet} vs. {TF-IDF} vs. Word Embeddings},
    author    = {Marci{\'n}czuk, Micha{\l} and
                 Gniewkowski, Mateusz and
                 Walkowiak, Tomasz and
                 B{\k{e}}dkowski, Marcin},
    editor    = {Vossen, Piek and
                 Fellbaum, Christiane},
    booktitle = {Proceedings of the 11th Global Wordnet Conference},
    month     = jan,
    year      = {2021},
    address   = {University of South Africa (UNISA)},
    publisher = {Global Wordnet Association},
    url       = {https://aclanthology.org/2021.gwc-1.24},
    pages     = {207--214},
    abstract  = {In the paper, we deal with the problem of unsupervised text document clustering for the Polish language. Our goal is to compare the modern approaches based on language modeling (doc2vec and BERT) with the classical ones, i.e., TF-IDF and wordnet-based. The experiments are conducted on three datasets containing qualification descriptions. The experiments{'} results showed that wordnet-based similarity measures could compete and even outperform modern embedding-based approaches.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding one record, citekey marcinczuk-etal-2021-text. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="marcinczuk-etal-2021-text">
    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>Text Document Clustering: Wordnet vs. TF-IDF vs. Word Embeddings</title>
    </titleInfo>
    <!-- Authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Michał</namePart>
      <namePart type="family">Marcińczuk</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mateusz</namePart>
      <namePart type="family">Gniewkowski</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tomasz</namePart>
      <namePart type="family">Walkowiak</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marcin</namePart>
      <namePart type="family">Będkowski</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher details. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 11th Global Wordnet Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Piek</namePart>
        <namePart type="family">Vossen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christiane</namePart>
        <namePart type="family">Fellbaum</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Global Wordnet Association</publisher>
        <place>
          <placeTerm type="text">University of South Africa (UNISA)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In the paper, we deal with the problem of unsupervised text document clustering for the Polish language. Our goal is to compare the modern approaches based on language modeling (doc2vec and BERT) with the classical ones, i.e., TF-IDF and wordnet-based. The experiments are conducted on three datasets containing qualification descriptions. The experiments’ results showed that wordnet-based similarity measures could compete and even outperform modern embedding-based approaches.</abstract>
    <identifier type="citekey">marcinczuk-etal-2021-text</identifier>
    <location>
      <url>https://aclanthology.org/2021.gwc-1.24</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2021-01</date>
      <extent unit="page">
        <start>207</start>
        <end>214</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Text Document Clustering: Wordnet vs. TF-IDF vs. Word Embeddings
%A Marcińczuk, Michał
%A Gniewkowski, Mateusz
%A Walkowiak, Tomasz
%A Będkowski, Marcin
%Y Vossen, Piek
%Y Fellbaum, Christiane
%S Proceedings of the 11th Global Wordnet Conference
%D 2021
%8 January
%I Global Wordnet Association
%C University of South Africa (UNISA)
%F marcinczuk-etal-2021-text
%X In the paper, we deal with the problem of unsupervised text document clustering for the Polish language. Our goal is to compare the modern approaches based on language modeling (doc2vec and BERT) with the classical ones, i.e., TF-IDF and wordnet-based. The experiments are conducted on three datasets containing qualification descriptions. The experiments’ results showed that wordnet-based similarity measures could compete and even outperform modern embedding-based approaches.
%U https://aclanthology.org/2021.gwc-1.24
%P 207-214
Markdown (Informal)
[Text Document Clustering: Wordnet vs. TF-IDF vs. Word Embeddings](https://aclanthology.org/2021.gwc-1.24) (Marcińczuk et al., GWC 2021)
ACL