@inproceedings{adhikari-etal-2020-exploring,
title = "Exploring the Limits of Simple Learners in Knowledge Distillation for Document Classification with {D}oc{BERT}",
author = "Adhikari, Ashutosh and
Ram, Achyudh and
Tang, Raphael and
Hamilton, William L. and
Lin, Jimmy",
editor = "Gella, Spandana and
Welbl, Johannes and
Rei, Marek and
Petroni, Fabio and
Lewis, Patrick and
Strubell, Emma and
Seo, Minjoon and
Hajishirzi, Hannaneh",
booktitle = "Proceedings of the 5th Workshop on Representation Learning for NLP",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.repl4nlp-1.10",
doi = "10.18653/v1/2020.repl4nlp-1.10",
pages = "72--77",
abstract = "Fine-tuned variants of BERT are able to achieve state-of-the-art accuracy on many natural language processing tasks, although at significant computational costs. In this paper, we verify BERT{'}s effectiveness for document classification and investigate the extent to which BERT-level effectiveness can be obtained by different baselines, combined with knowledge distillation{---}a popular model compression method. The results show that BERT-level effectiveness can be achieved by a single-layer LSTM with at least $40\times$ fewer FLOPS and only ${\sim}3\%$ parameters. More importantly, this study analyzes the limits of knowledge distillation as we distill BERT{'}s knowledge all the way down to linear models{---}a relevant baseline for the task. We report substantial improvement in effectiveness for even the simplest models, as they capture the knowledge learnt by BERT.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="adhikari-etal-2020-exploring">
<titleInfo>
<title>Exploring the Limits of Simple Learners in Knowledge Distillation for Document Classification with DocBERT</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ashutosh</namePart>
<namePart type="family">Adhikari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Achyudh</namePart>
<namePart type="family">Ram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raphael</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Hamilton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jimmy</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Representation Learning for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Spandana</namePart>
<namePart type="family">Gella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johannes</namePart>
<namePart type="family">Welbl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marek</namePart>
<namePart type="family">Rei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Petroni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Lewis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emma</namePart>
<namePart type="family">Strubell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minjoon</namePart>
<namePart type="family">Seo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannaneh</namePart>
<namePart type="family">Hajishirzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Fine-tuned variants of BERT are able to achieve state-of-the-art accuracy on many natural language processing tasks, although at significant computational costs. In this paper, we verify BERT’s effectiveness for document classification and investigate the extent to which BERT-level effectiveness can be obtained by different baselines, combined with knowledge distillation—a popular model compression method. The results show that BERT-level effectiveness can be achieved by a single-layer LSTM with at least 40× fewer FLOPS and only ~3% parameters. More importantly, this study analyzes the limits of knowledge distillation as we distill BERT’s knowledge all the way down to linear models—a relevant baseline for the task. We report substantial improvement in effectiveness for even the simplest models, as they capture the knowledge learnt by BERT.</abstract>
<identifier type="citekey">adhikari-etal-2020-exploring</identifier>
<identifier type="doi">10.18653/v1/2020.repl4nlp-1.10</identifier>
<location>
<url>https://aclanthology.org/2020.repl4nlp-1.10</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>72</start>
<end>77</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring the Limits of Simple Learners in Knowledge Distillation for Document Classification with DocBERT
%A Adhikari, Ashutosh
%A Ram, Achyudh
%A Tang, Raphael
%A Hamilton, William L.
%A Lin, Jimmy
%Y Gella, Spandana
%Y Welbl, Johannes
%Y Rei, Marek
%Y Petroni, Fabio
%Y Lewis, Patrick
%Y Strubell, Emma
%Y Seo, Minjoon
%Y Hajishirzi, Hannaneh
%S Proceedings of the 5th Workshop on Representation Learning for NLP
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F adhikari-etal-2020-exploring
%X Fine-tuned variants of BERT are able to achieve state-of-the-art accuracy on many natural language processing tasks, although at significant computational costs. In this paper, we verify BERT’s effectiveness for document classification and investigate the extent to which BERT-level effectiveness can be obtained by different baselines, combined with knowledge distillation—a popular model compression method. The results show that BERT-level effectiveness can be achieved by a single-layer LSTM with at least 40× fewer FLOPS and only ~3% parameters. More importantly, this study analyzes the limits of knowledge distillation as we distill BERT’s knowledge all the way down to linear models—a relevant baseline for the task. We report substantial improvement in effectiveness for even the simplest models, as they capture the knowledge learnt by BERT.
%R 10.18653/v1/2020.repl4nlp-1.10
%U https://aclanthology.org/2020.repl4nlp-1.10
%U https://doi.org/10.18653/v1/2020.repl4nlp-1.10
%P 72-77
Markdown (Informal)
[Exploring the Limits of Simple Learners in Knowledge Distillation for Document Classification with DocBERT](https://aclanthology.org/2020.repl4nlp-1.10) (Adhikari et al., RepL4NLP 2020)
ACL
Ashutosh Adhikari, Achyudh Ram, Raphael Tang, William L. Hamilton, and Jimmy Lin. 2020. Exploring the Limits of Simple Learners in Knowledge Distillation for Document Classification with DocBERT. In Proceedings of the 5th Workshop on Representation Learning for NLP, pages 72–77, Online. Association for Computational Linguistics.