@inproceedings{lioutas-etal-2020-improving,
title = "{I}mproving {W}ord {E}mbedding {F}actorization for {C}ompression {U}sing {D}istilled {N}onlinear {N}eural {D}ecomposition",
author = "Lioutas, Vasileios and
Rashid, Ahmad and
Kumar, Krtin and
Haidar, Md. Akmal and
Rezagholizadeh, Mehdi",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.250",
doi = "10.18653/v1/2020.findings-emnlp.250",
pages = "2774--2784",
abstract = "Word-embeddings are vital components of Natural Language Processing (NLP) models and have been extensively explored. However, they consume a lot of memory which poses a challenge for edge deployment. Embedding matrices, typically, contain most of the parameters for language models and about a third for machine translation systems. In this paper, we propose Distilled Embedding, an (input/output) embedding compression method based on low-rank matrix decomposition and knowledge distillation. First, we initialize the weights of our decomposed matrices by learning to reconstruct the full pre-trained word-embedding and then fine-tune end-to-end, employing knowledge distillation on the factorized embedding. We conduct extensive experiments with various compression rates on machine translation and language modeling, using different data-sets with a shared word-embedding matrix for both embedding and vocabulary projection matrices. We show that the proposed technique is simple to replicate, with one fixed parameter controlling compression size, has higher BLEU score on translation and lower perplexity on language modeling compared to complex, difficult to tune state-of-the-art methods.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lioutas-etal-2020-improving">
<titleInfo>
<title>Improving Word Embedding Factorization for Compression Using Distilled Nonlinear Neural Decomposition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vasileios</namePart>
<namePart type="family">Lioutas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmad</namePart>
<namePart type="family">Rashid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Krtin</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Akmal</namePart>
<namePart type="family">Haidar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehdi</namePart>
<namePart type="family">Rezagholizadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word-embeddings are vital components of Natural Language Processing (NLP) models and have been extensively explored. However, they consume a lot of memory which poses a challenge for edge deployment. Embedding matrices, typically, contain most of the parameters for language models and about a third for machine translation systems. In this paper, we propose Distilled Embedding, an (input/output) embedding compression method based on low-rank matrix decomposition and knowledge distillation. First, we initialize the weights of our decomposed matrices by learning to reconstruct the full pre-trained word-embedding and then fine-tune end-to-end, employing knowledge distillation on the factorized embedding. We conduct extensive experiments with various compression rates on machine translation and language modeling, using different data-sets with a shared word-embedding matrix for both embedding and vocabulary projection matrices. We show that the proposed technique is simple to replicate, with one fixed parameter controlling compression size, has higher BLEU score on translation and lower perplexity on language modeling compared to complex, difficult to tune state-of-the-art methods.</abstract>
<identifier type="citekey">lioutas-etal-2020-improving</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.250</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.250</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>2774</start>
<end>2784</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Word Embedding Factorization for Compression Using Distilled Nonlinear Neural Decomposition
%A Lioutas, Vasileios
%A Rashid, Ahmad
%A Kumar, Krtin
%A Haidar, Md. Akmal
%A Rezagholizadeh, Mehdi
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F lioutas-etal-2020-improving
%X Word-embeddings are vital components of Natural Language Processing (NLP) models and have been extensively explored. However, they consume a lot of memory which poses a challenge for edge deployment. Embedding matrices, typically, contain most of the parameters for language models and about a third for machine translation systems. In this paper, we propose Distilled Embedding, an (input/output) embedding compression method based on low-rank matrix decomposition and knowledge distillation. First, we initialize the weights of our decomposed matrices by learning to reconstruct the full pre-trained word-embedding and then fine-tune end-to-end, employing knowledge distillation on the factorized embedding. We conduct extensive experiments with various compression rates on machine translation and language modeling, using different data-sets with a shared word-embedding matrix for both embedding and vocabulary projection matrices. We show that the proposed technique is simple to replicate, with one fixed parameter controlling compression size, has higher BLEU score on translation and lower perplexity on language modeling compared to complex, difficult to tune state-of-the-art methods.
%R 10.18653/v1/2020.findings-emnlp.250
%U https://aclanthology.org/2020.findings-emnlp.250
%U https://doi.org/10.18653/v1/2020.findings-emnlp.250
%P 2774-2784
Markdown (Informal)
[Improving Word Embedding Factorization for Compression Using Distilled Nonlinear Neural Decomposition](https://aclanthology.org/2020.findings-emnlp.250) (Lioutas et al., Findings 2020)
ACL