@inproceedings{wasserblat-etal-2020-exploring,
title = "Exploring the Boundaries of Low-Resource {BERT} Distillation",
author = "Wasserblat, Moshe and
Pereg, Oren and
Izsak, Peter",
editor = "Moosavi, Nafise Sadat and
Fan, Angela and
Shwartz, Vered and
Glava{\v{s}}, Goran and
Joty, Shafiq and
Wang, Alex and
Wolf, Thomas",
booktitle = "Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.sustainlp-1.5",
doi = "10.18653/v1/2020.sustainlp-1.5",
pages = "35--40",
abstract = "In recent years, large pre-trained models have demonstrated state-of-the-art performance in many of NLP tasks. However, the deployment of these models on devices with limited resources is challenging due to the models{'} large computational consumption and memory requirements. Moreover, the need for a considerable amount of labeled training data also hinders real-world deployment scenarios. Model distillation has shown promising results for reducing model size, computational load and data efficiency. In this paper we test the boundaries of BERT model distillation in terms of model compression, inference efficiency and data scarcity. We show that classification tasks that require the capturing of general lexical semantics can be successfully distilled by very simple and efficient models and require relatively small amount of labeled training data. We also show that the distillation of large pre-trained models is more effective in real-life scenarios where limited amounts of labeled training are available.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wasserblat-etal-2020-exploring">
<titleInfo>
<title>Exploring the Boundaries of Low-Resource BERT Distillation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Moshe</namePart>
<namePart type="family">Wasserblat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oren</namePart>
<namePart type="family">Pereg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Izsak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nafise</namePart>
<namePart type="given">Sadat</namePart>
<namePart type="family">Moosavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angela</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vered</namePart>
<namePart type="family">Shwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Goran</namePart>
<namePart type="family">Glavaš</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shafiq</namePart>
<namePart type="family">Joty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Wolf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In recent years, large pre-trained models have demonstrated state-of-the-art performance in many NLP tasks. However, the deployment of these models on devices with limited resources is challenging due to the models’ large computational consumption and memory requirements. Moreover, the need for a considerable amount of labeled training data also hinders real-world deployment scenarios. Model distillation has shown promising results for reducing model size and computational load, and for improving data efficiency. In this paper we test the boundaries of BERT model distillation in terms of model compression, inference efficiency and data scarcity. We show that classification tasks that require capturing general lexical semantics can be successfully distilled into very simple and efficient models and require a relatively small amount of labeled training data. We also show that the distillation of large pre-trained models is more effective in real-life scenarios where only limited amounts of labeled training data are available.</abstract>
<identifier type="citekey">wasserblat-etal-2020-exploring</identifier>
<identifier type="doi">10.18653/v1/2020.sustainlp-1.5</identifier>
<location>
<url>https://aclanthology.org/2020.sustainlp-1.5</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>35</start>
<end>40</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring the Boundaries of Low-Resource BERT Distillation
%A Wasserblat, Moshe
%A Pereg, Oren
%A Izsak, Peter
%Y Moosavi, Nafise Sadat
%Y Fan, Angela
%Y Shwartz, Vered
%Y Glavaš, Goran
%Y Joty, Shafiq
%Y Wang, Alex
%Y Wolf, Thomas
%S Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F wasserblat-etal-2020-exploring
%X In recent years, large pre-trained models have demonstrated state-of-the-art performance in many NLP tasks. However, the deployment of these models on devices with limited resources is challenging due to the models’ large computational consumption and memory requirements. Moreover, the need for a considerable amount of labeled training data also hinders real-world deployment scenarios. Model distillation has shown promising results for reducing model size and computational load, and for improving data efficiency. In this paper we test the boundaries of BERT model distillation in terms of model compression, inference efficiency and data scarcity. We show that classification tasks that require capturing general lexical semantics can be successfully distilled into very simple and efficient models and require a relatively small amount of labeled training data. We also show that the distillation of large pre-trained models is more effective in real-life scenarios where only limited amounts of labeled training data are available.
%R 10.18653/v1/2020.sustainlp-1.5
%U https://aclanthology.org/2020.sustainlp-1.5
%U https://doi.org/10.18653/v1/2020.sustainlp-1.5
%P 35-40
Markdown (Informal)
[Exploring the Boundaries of Low-Resource BERT Distillation](https://aclanthology.org/2020.sustainlp-1.5) (Wasserblat et al., sustainlp 2020)
ACL
Moshe Wasserblat, Oren Pereg, and Peter Izsak. 2020. Exploring the Boundaries of Low-Resource BERT Distillation. In Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing, pages 35–40, Online. Association for Computational Linguistics.