@inproceedings{navali-etal-2020-word,
title = "Word Embedding Binarization with Semantic Information Preservation",
author = "Navali, Samarth and
Sherki, Praneet and
Inturi, Ramesh and
Vala, Vanraj",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.108",
doi = "10.18653/v1/2020.coling-main.108",
pages = "1256--1265",
abstract = "With growing applications of Machine Learning in daily lives Natural Language Processing (NLP) has emerged as a heavily researched area. Finding its applications in tasks ranging from simple Q/A chatbots to Fully fledged conversational AI, NLP models are vital. Word and Sentence embedding are one of the most common starting points of any NLP task. A word embedding represents a given word in a predefined vector-space while maintaining vector relations with similar or dis-similar entities. As such different pretrained embedding such as Word2Vec, GloVe, fasttext have been developed. These embedding generated on millions of words are however very large in terms of size. Having embedding with floating point precision also makes the downstream evaluation slow. In this paper we present a novel method to convert continuous embedding to its binary representation, thus reducing the overall size of the embedding while keeping the semantic and relational knowledge intact. This will facilitate an option of porting such big embedding onto devices where space is limited. We also present different approaches suitable for different downstream tasks based on the requirement of contextual and semantic information. Experiments have shown comparable result in downstream tasks with 7 to 15 times reduction in file size and about 5 {\%} change in evaluation parameters.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="navali-etal-2020-word">
<titleInfo>
<title>Word Embedding Binarization with Semantic Information Preservation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samarth</namePart>
<namePart type="family">Navali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Praneet</namePart>
<namePart type="family">Sherki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramesh</namePart>
<namePart type="family">Inturi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vanraj</namePart>
<namePart type="family">Vala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the growing application of machine learning in daily life, Natural Language Processing (NLP) has emerged as a heavily researched area. Finding applications in tasks ranging from simple Q/A chatbots to fully fledged conversational AI, NLP models are vital. Word and sentence embeddings are among the most common starting points of any NLP task. A word embedding represents a given word in a predefined vector space while maintaining vector relations with similar or dissimilar entities. Accordingly, various pretrained embeddings such as Word2Vec, GloVe, and fastText have been developed. These embeddings, generated over millions of words, are however very large in size. Storing embeddings at floating-point precision also makes downstream evaluation slow. In this paper we present a novel method to convert a continuous embedding to its binary representation, reducing the overall size of the embedding while keeping the semantic and relational knowledge intact. This facilitates porting such large embeddings onto devices where space is limited. We also present different approaches suitable for different downstream tasks based on the requirement for contextual and semantic information. Experiments have shown comparable results in downstream tasks with a 7 to 15 times reduction in file size and about a 5 % change in evaluation parameters.</abstract>
<identifier type="citekey">navali-etal-2020-word</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.108</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.108</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>1256</start>
<end>1265</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Word Embedding Binarization with Semantic Information Preservation
%A Navali, Samarth
%A Sherki, Praneet
%A Inturi, Ramesh
%A Vala, Vanraj
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F navali-etal-2020-word
%X With the growing application of machine learning in daily life, Natural Language Processing (NLP) has emerged as a heavily researched area. Finding applications in tasks ranging from simple Q/A chatbots to fully fledged conversational AI, NLP models are vital. Word and sentence embeddings are among the most common starting points of any NLP task. A word embedding represents a given word in a predefined vector space while maintaining vector relations with similar or dissimilar entities. Accordingly, various pretrained embeddings such as Word2Vec, GloVe, and fastText have been developed. These embeddings, generated over millions of words, are however very large in size. Storing embeddings at floating-point precision also makes downstream evaluation slow. In this paper we present a novel method to convert a continuous embedding to its binary representation, reducing the overall size of the embedding while keeping the semantic and relational knowledge intact. This facilitates porting such large embeddings onto devices where space is limited. We also present different approaches suitable for different downstream tasks based on the requirement for contextual and semantic information. Experiments have shown comparable results in downstream tasks with a 7 to 15 times reduction in file size and about a 5 % change in evaluation parameters.
%R 10.18653/v1/2020.coling-main.108
%U https://aclanthology.org/2020.coling-main.108
%U https://doi.org/10.18653/v1/2020.coling-main.108
%P 1256-1265
Markdown (Informal)
[Word Embedding Binarization with Semantic Information Preservation](https://aclanthology.org/2020.coling-main.108) (Navali et al., COLING 2020)
ACL
Samarth Navali, Praneet Sherki, Ramesh Inturi, and Vanraj Vala. 2020. Word Embedding Binarization with Semantic Information Preservation. In Proceedings of the 28th International Conference on Computational Linguistics, pages 1256–1265, Barcelona, Spain (Online). International Committee on Computational Linguistics.
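For a concrete sense of what converting continuous embeddings to a binary representation involves, the following is a minimal illustrative sketch only, using simple per-dimension median thresholding with bit packing. It is not the semantic-preserving method proposed in the cited paper, and the names in it (e.g. `binarize_embeddings`) are hypothetical.

```python
import numpy as np

def binarize_embeddings(vectors: np.ndarray) -> np.ndarray:
    """Binarize continuous embeddings by thresholding each dimension
    at its median, then pack the resulting bits into uint8 bytes.

    Illustrative baseline only; the cited paper proposes its own
    semantic-information-preserving binarization approaches.
    """
    # Per-dimension median threshold: values above the median map to 1.
    thresholds = np.median(vectors, axis=0)
    bits = (vectors > thresholds).astype(np.uint8)
    # Pack 8 dimensions per byte: a 300-d float32 vector (1200 bytes)
    # becomes ceil(300 / 8) = 38 bytes per word.
    return np.packbits(bits, axis=1)

# Example: 10,000 words with 300-dimensional float32 embeddings.
emb = np.random.randn(10_000, 300).astype(np.float32)
packed = binarize_embeddings(emb)
print(emb.nbytes, "->", packed.nbytes, "bytes")
```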