@inproceedings{gupta-etal-2019-improving,
title = "Improving Word Embeddings Using Kernel {PCA}",
author = {Gupta, Vishwani and
Giesselbach, Sven and
R{\"u}ping, Stefan and
Bauckhage, Christian},
editor = "Augenstein, Isabelle and
Gella, Spandana and
Ruder, Sebastian and
Kann, Katharina and
Can, Burcu and
Welbl, Johannes and
Conneau, Alexis and
Ren, Xiang and
Rei, Marek",
booktitle = "Proceedings of the 4th Workshop on Representation Learning for NLP (RepL4NLP-2019)",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-4323",
doi = "10.18653/v1/W19-4323",
pages = "200--208",
abstract = "Word-based embedding approaches such as Word2Vec capture the meaning of words and relations between them, particularly well when trained with large text collections; however, they fail to do so with small datasets. Extensions such as fastText reduce the amount of data needed slightly, however, the joint task of learning meaningful morphology, syntactic and semantic representations still requires a lot of data. In this paper, we introduce a new approach to warm-start embedding models with morphological information, in order to reduce training time and enhance their performance. We use word embeddings generated using both word2vec and fastText models and enrich them with morphological information of words, derived from kernel principal component analysis (KPCA) of word similarity matrices. This can be seen as explicitly feeding the network morphological similarities and letting it learn semantic and syntactic similarities. Evaluating our models on word similarity and analogy tasks in English and German, we find that they not only achieve higher accuracies than the original skip-gram and fastText models but also require significantly less training data and time. Another benefit of our approach is that it is capable of generating a high-quality representation of infrequent words as, for example, found in very recent news articles with rapidly changing vocabularies. Lastly, we evaluate the different models on a downstream sentence classification task in which a CNN model is initialized with our embeddings and find promising results.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gupta-etal-2019-improving">
<titleInfo>
<title>Improving Word Embeddings Using Kernel PCA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vishwani</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sven</namePart>
<namePart type="family">Giesselbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="family">Rüping</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Bauckhage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Representation Learning for NLP (RepL4NLP-2019)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Isabelle</namePart>
<namePart type="family">Augenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Spandana</namePart>
<namePart type="family">Gella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Ruder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">Kann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Burcu</namePart>
<namePart type="family">Can</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johannes</namePart>
<namePart type="family">Welbl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Conneau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marek</namePart>
<namePart type="family">Rei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word-based embedding approaches such as Word2Vec capture the meaning of words and relations between them, particularly well when trained with large text collections; however, they fail to do so with small datasets. Extensions such as fastText reduce the amount of data needed slightly, however, the joint task of learning meaningful morphology, syntactic and semantic representations still requires a lot of data. In this paper, we introduce a new approach to warm-start embedding models with morphological information, in order to reduce training time and enhance their performance. We use word embeddings generated using both word2vec and fastText models and enrich them with morphological information of words, derived from kernel principal component analysis (KPCA) of word similarity matrices. This can be seen as explicitly feeding the network morphological similarities and letting it learn semantic and syntactic similarities. Evaluating our models on word similarity and analogy tasks in English and German, we find that they not only achieve higher accuracies than the original skip-gram and fastText models but also require significantly less training data and time. Another benefit of our approach is that it is capable of generating a high-quality representation of infrequent words as, for example, found in very recent news articles with rapidly changing vocabularies. Lastly, we evaluate the different models on a downstream sentence classification task in which a CNN model is initialized with our embeddings and find promising results.</abstract>
<identifier type="citekey">gupta-etal-2019-improving</identifier>
<identifier type="doi">10.18653/v1/W19-4323</identifier>
<location>
<url>https://aclanthology.org/W19-4323</url>
</location>
<part>
<date>2019-08</date>
<extent unit="page">
<start>200</start>
<end>208</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Word Embeddings Using Kernel PCA
%A Gupta, Vishwani
%A Giesselbach, Sven
%A Rüping, Stefan
%A Bauckhage, Christian
%Y Augenstein, Isabelle
%Y Gella, Spandana
%Y Ruder, Sebastian
%Y Kann, Katharina
%Y Can, Burcu
%Y Welbl, Johannes
%Y Conneau, Alexis
%Y Ren, Xiang
%Y Rei, Marek
%S Proceedings of the 4th Workshop on Representation Learning for NLP (RepL4NLP-2019)
%D 2019
%8 August
%I Association for Computational Linguistics
%C Florence, Italy
%F gupta-etal-2019-improving
%X Word-based embedding approaches such as Word2Vec capture the meaning of words and relations between them, particularly well when trained with large text collections; however, they fail to do so with small datasets. Extensions such as fastText reduce the amount of data needed slightly, however, the joint task of learning meaningful morphology, syntactic and semantic representations still requires a lot of data. In this paper, we introduce a new approach to warm-start embedding models with morphological information, in order to reduce training time and enhance their performance. We use word embeddings generated using both word2vec and fastText models and enrich them with morphological information of words, derived from kernel principal component analysis (KPCA) of word similarity matrices. This can be seen as explicitly feeding the network morphological similarities and letting it learn semantic and syntactic similarities. Evaluating our models on word similarity and analogy tasks in English and German, we find that they not only achieve higher accuracies than the original skip-gram and fastText models but also require significantly less training data and time. Another benefit of our approach is that it is capable of generating a high-quality representation of infrequent words as, for example, found in very recent news articles with rapidly changing vocabularies. Lastly, we evaluate the different models on a downstream sentence classification task in which a CNN model is initialized with our embeddings and find promising results.
%R 10.18653/v1/W19-4323
%U https://aclanthology.org/W19-4323
%U https://doi.org/10.18653/v1/W19-4323
%P 200-208
Markdown (Informal)
[Improving Word Embeddings Using Kernel PCA](https://aclanthology.org/W19-4323) (Gupta et al., RepL4NLP 2019)
ACL
- Vishwani Gupta, Sven Giesselbach, Stefan Rüping, and Christian Bauckhage. 2019. Improving Word Embeddings Using Kernel PCA. In Proceedings of the 4th Workshop on Representation Learning for NLP (RepL4NLP-2019), pages 200–208, Florence, Italy. Association for Computational Linguistics.