BibTeX
@inproceedings{elekes-etal-2018-resources,
    title = "Resources to Examine the Quality of Word Embedding Models Trained on n-Gram Data",
    author = {Elekes, {\'A}bel and
      Englhardt, Adrian and
      Sch{\"a}ler, Martin and
      B{\"o}hm, Klemens},
    editor = "Korhonen, Anna and
      Titov, Ivan",
    booktitle = "Proceedings of the 22nd Conference on Computational Natural Language Learning",
    month = oct,
    year = "2018",
    address = "Brussels, Belgium",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/K18-1041/",
    doi = "10.18653/v1/K18-1041",
    pages = "423--432",
    abstract = "Word embeddings are powerful tools that facilitate better analysis of natural language. However, their quality highly depends on the resource used for training. There are various approaches relying on n-gram corpora, such as the Google n-gram corpus. However, n-gram corpora only offer a small window into the full text {--} 5 words for the Google corpus at best. This gives rise to the concern of whether the extracted word semantics are of high quality. In this paper, we address this concern with two contributions. First, we provide a resource containing 120 word-embedding models {--} one of the largest collections of embedding models. Furthermore, the resource contains the n-gramed versions of all used corpora, as well as our scripts used for corpus generation, model generation and evaluation. Second, we define a set of meaningful experiments allowing one to evaluate the aforementioned quality differences. We conduct these experiments using our resource to show its usage and significance. The evaluation results confirm that one generally can expect high quality for n-grams with n {\ensuremath{>}} 3."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="elekes-etal-2018-resources">
    <titleInfo>
      <title>Resources to Examine the Quality of Word Embedding Models Trained on n-Gram Data</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ábel</namePart>
      <namePart type="family">Elekes</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adrian</namePart>
      <namePart type="family">Englhardt</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Martin</namePart>
      <namePart type="family">Schäler</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Klemens</namePart>
      <namePart type="family">Böhm</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2018-10</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 22nd Conference on Computational Natural Language Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Korhonen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ivan</namePart>
        <namePart type="family">Titov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Brussels, Belgium</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Word embeddings are powerful tools that facilitate better analysis of natural language. However, their quality highly depends on the resource used for training. There are various approaches relying on n-gram corpora, such as the Google n-gram corpus. However, n-gram corpora only offer a small window into the full text – 5 words for the Google corpus at best. This gives rise to the concern of whether the extracted word semantics are of high quality. In this paper, we address this concern with two contributions. First, we provide a resource containing 120 word-embedding models – one of the largest collections of embedding models. Furthermore, the resource contains the n-gramed versions of all used corpora, as well as our scripts used for corpus generation, model generation and evaluation. Second, we define a set of meaningful experiments allowing one to evaluate the aforementioned quality differences. We conduct these experiments using our resource to show its usage and significance. The evaluation results confirm that one generally can expect high quality for n-grams with n &gt; 3.</abstract>
    <identifier type="citekey">elekes-etal-2018-resources</identifier>
    <identifier type="doi">10.18653/v1/K18-1041</identifier>
    <location>
      <url>https://aclanthology.org/K18-1041/</url>
    </location>
    <part>
      <date>2018-10</date>
      <extent unit="page">
        <start>423</start>
        <end>432</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Resources to Examine the Quality of Word Embedding Models Trained on n-Gram Data
%A Elekes, Ábel
%A Englhardt, Adrian
%A Schäler, Martin
%A Böhm, Klemens
%Y Korhonen, Anna
%Y Titov, Ivan
%S Proceedings of the 22nd Conference on Computational Natural Language Learning
%D 2018
%8 October
%I Association for Computational Linguistics
%C Brussels, Belgium
%F elekes-etal-2018-resources
%X Word embeddings are powerful tools that facilitate better analysis of natural language. However, their quality highly depends on the resource used for training. There are various approaches relying on n-gram corpora, such as the Google n-gram corpus. However, n-gram corpora only offer a small window into the full text – 5 words for the Google corpus at best. This gives rise to the concern of whether the extracted word semantics are of high quality. In this paper, we address this concern with two contributions. First, we provide a resource containing 120 word-embedding models – one of the largest collections of embedding models. Furthermore, the resource contains the n-gramed versions of all used corpora, as well as our scripts used for corpus generation, model generation and evaluation. Second, we define a set of meaningful experiments allowing one to evaluate the aforementioned quality differences. We conduct these experiments using our resource to show its usage and significance. The evaluation results confirm that one generally can expect high quality for n-grams with n > 3.
%R 10.18653/v1/K18-1041
%U https://aclanthology.org/K18-1041/
%U https://doi.org/10.18653/v1/K18-1041
%P 423-432
Markdown (Informal)
[Resources to Examine the Quality of Word Embedding Models Trained on n-Gram Data](https://aclanthology.org/K18-1041/) (Elekes et al., CoNLL 2018)

ACL
Ábel Elekes, Adrian Englhardt, Martin Schäler, and Klemens Böhm. 2018. Resources to Examine the Quality of Word Embedding Models Trained on n-Gram Data. In Proceedings of the 22nd Conference on Computational Natural Language Learning, pages 423–432, Brussels, Belgium. Association for Computational Linguistics.