@inproceedings{walkowiak-gniewkowski-2019-evaluation,
title = "Evaluation of vector embedding models in clustering of text documents",
author = "Walkowiak, Tomasz and
Gniewkowski, Mateusz",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)",
month = sep,
year = "2019",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/R19-1149",
doi = "10.26615/978-954-452-056-4_149",
pages = "1304--1311",
abstract = "The paper presents an evaluation of word embedding models in clustering of texts in the Polish language. Authors verified six different embedding models, starting from widely used word2vec, across fastText with character n-grams embedding, to deep learning-based ELMo and BERT. Moreover, four standardisation methods, three distance measures and four clustering methods were evaluated. The analysis was performed on two corpora of texts in Polish classified into subjects. The Adjusted Mutual Information (AMI) metric was used to verify the quality of clustering results. The performed experiments show that Skipgram models with n-grams character embedding, built on KGR10 corpus and provided by Clarin-PL, outperforms other publicly available models for Polish. Moreover, presented results suggest that Yeo{--}Johnson transformation for document vectors standardisation and Agglomerative Clustering with a cosine distance should be used for grouping of text documents.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="walkowiak-gniewkowski-2019-evaluation">
<titleInfo>
<title>Evaluation of vector embedding models in clustering of text documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tomasz</namePart>
<namePart type="family">Walkowiak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mateusz</namePart>
<namePart type="family">Gniewkowski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The paper presents an evaluation of word embedding models in clustering of texts in the Polish language. Authors verified six different embedding models, starting from widely used word2vec, across fastText with character n-grams embedding, to deep learning-based ELMo and BERT. Moreover, four standardisation methods, three distance measures and four clustering methods were evaluated. The analysis was performed on two corpora of texts in Polish classified into subjects. The Adjusted Mutual Information (AMI) metric was used to verify the quality of clustering results. The performed experiments show that Skipgram models with n-grams character embedding, built on KGR10 corpus and provided by Clarin-PL, outperforms other publicly available models for Polish. Moreover, presented results suggest that Yeo–Johnson transformation for document vectors standardisation and Agglomerative Clustering with a cosine distance should be used for grouping of text documents.</abstract>
<identifier type="citekey">walkowiak-gniewkowski-2019-evaluation</identifier>
<identifier type="doi">10.26615/978-954-452-056-4_149</identifier>
<location>
<url>https://aclanthology.org/R19-1149</url>
</location>
<part>
<date>2019-09</date>
<extent unit="page">
<start>1304</start>
<end>1311</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluation of vector embedding models in clustering of text documents
%A Walkowiak, Tomasz
%A Gniewkowski, Mateusz
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)
%D 2019
%8 September
%I INCOMA Ltd.
%C Varna, Bulgaria
%F walkowiak-gniewkowski-2019-evaluation
%X The paper presents an evaluation of word embedding models in clustering of texts in the Polish language. Authors verified six different embedding models, starting from widely used word2vec, across fastText with character n-grams embedding, to deep learning-based ELMo and BERT. Moreover, four standardisation methods, three distance measures and four clustering methods were evaluated. The analysis was performed on two corpora of texts in Polish classified into subjects. The Adjusted Mutual Information (AMI) metric was used to verify the quality of clustering results. The performed experiments show that Skipgram models with n-grams character embedding, built on KGR10 corpus and provided by Clarin-PL, outperforms other publicly available models for Polish. Moreover, presented results suggest that Yeo–Johnson transformation for document vectors standardisation and Agglomerative Clustering with a cosine distance should be used for grouping of text documents.
%R 10.26615/978-954-452-056-4_149
%U https://aclanthology.org/R19-1149
%U https://doi.org/10.26615/978-954-452-056-4_149
%P 1304-1311
Markdown (Informal)
[Evaluation of vector embedding models in clustering of text documents](https://aclanthology.org/R19-1149) (Walkowiak & Gniewkowski, RANLP 2019)
ACL