@inproceedings{prasanna-etal-2020-bert,
    title = "{W}hen {BERT} {P}lays the {L}ottery, {A}ll {T}ickets {A}re {W}inning",
    author = "Prasanna, Sai and
      Rogers, Anna and
      Rumshisky, Anna",
    editor = "Webber, Bonnie and
      Cohn, Trevor and
      He, Yulan and
      Liu, Yang",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.emnlp-main.259",
    doi = "10.18653/v1/2020.emnlp-main.259",
    pages = "3208--3229",
    abstract = "Large Transformer-based models were shown to be reducible to a smaller number of self-attention heads and layers. We consider this phenomenon from the perspective of the lottery ticket hypothesis, using both structured and magnitude pruning. For fine-tuned BERT, we show that (a) it is possible to find subnetworks achieving performance that is comparable with that of the full model, and (b) similarly-sized subnetworks sampled from the rest of the model perform worse. Strikingly, with structured pruning even the worst possible subnetworks remain highly trainable, indicating that most pre-trained BERT weights are potentially useful. We also study the {``}good{''} subnetworks to see if their success can be attributed to superior linguistic knowledge, but find them unstable, and not explained by meaningful self-attention patterns.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="prasanna-etal-2020-bert">
<titleInfo>
<title>When BERT Plays the Lottery, All Tickets Are Winning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sai</namePart>
<namePart type="family">Prasanna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rumshisky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bonnie</namePart>
<namePart type="family">Webber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Transformer-based models were shown to be reducible to a smaller number of self-attention heads and layers. We consider this phenomenon from the perspective of the lottery ticket hypothesis, using both structured and magnitude pruning. For fine-tuned BERT, we show that (a) it is possible to find subnetworks achieving performance that is comparable with that of the full model, and (b) similarly-sized subnetworks sampled from the rest of the model perform worse. Strikingly, with structured pruning even the worst possible subnetworks remain highly trainable, indicating that most pre-trained BERT weights are potentially useful. We also study the “good” subnetworks to see if their success can be attributed to superior linguistic knowledge, but find them unstable, and not explained by meaningful self-attention patterns.</abstract>
<identifier type="citekey">prasanna-etal-2020-bert</identifier>
<identifier type="doi">10.18653/v1/2020.emnlp-main.259</identifier>
<location>
<url>https://aclanthology.org/2020.emnlp-main.259</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>3208</start>
<end>3229</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When BERT Plays the Lottery, All Tickets Are Winning
%A Prasanna, Sai
%A Rogers, Anna
%A Rumshisky, Anna
%Y Webber, Bonnie
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F prasanna-etal-2020-bert
%X Large Transformer-based models were shown to be reducible to a smaller number of self-attention heads and layers. We consider this phenomenon from the perspective of the lottery ticket hypothesis, using both structured and magnitude pruning. For fine-tuned BERT, we show that (a) it is possible to find subnetworks achieving performance that is comparable with that of the full model, and (b) similarly-sized subnetworks sampled from the rest of the model perform worse. Strikingly, with structured pruning even the worst possible subnetworks remain highly trainable, indicating that most pre-trained BERT weights are potentially useful. We also study the “good” subnetworks to see if their success can be attributed to superior linguistic knowledge, but find them unstable, and not explained by meaningful self-attention patterns.
%R 10.18653/v1/2020.emnlp-main.259
%U https://aclanthology.org/2020.emnlp-main.259
%U https://doi.org/10.18653/v1/2020.emnlp-main.259
%P 3208-3229
Markdown (Informal)
[When BERT Plays the Lottery, All Tickets Are Winning](https://aclanthology.org/2020.emnlp-main.259) (Prasanna et al., EMNLP 2020)

ACL
Sai Prasanna, Anna Rogers, and Anna Rumshisky. 2020. When BERT Plays the Lottery, All Tickets Are Winning. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 3208–3229, Online. Association for Computational Linguistics.
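For readers who want a concrete picture of the two pruning styles the abstract refers to, here is a minimal, hedged sketch (not the authors' released code) of magnitude pruning of individual weights and structured pruning of whole self-attention heads. It uses HuggingFace Transformers and PyTorch's pruning utilities; the model name, sparsity level, and head indices are illustrative assumptions.

```python
# Illustrative sketch only (assumed libraries: torch, transformers); not the paper's code.
import torch
from torch.nn.utils import prune
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")

# Magnitude pruning: zero out the 30% smallest-magnitude weights in every
# linear layer of the encoder (the 30% figure is arbitrary here).
for module in model.encoder.modules():
    if isinstance(module, torch.nn.Linear):
        prune.l1_unstructured(module, name="weight", amount=0.3)
        prune.remove(module, "weight")  # bake the pruning mask into the weight tensor

# Structured pruning: remove whole self-attention heads, e.g. heads 0 and 1 of
# layer 0 (the paper selects heads by an importance score on the downstream task).
model.prune_heads({0: [0, 1]})

print(f"parameters after pruning: {sum(p.numel() for p in model.parameters()):,}")
```

The snippet only performs the pruning itself; the paper's comparison of "good" versus similarly-sized subnetworks sampled from the rest of the model additionally requires re-evaluating (or re-fine-tuning) each pruned model on the downstream task.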