@inproceedings{behnke-heafield-2020-losing,
    title = "Losing Heads in the Lottery: Pruning Transformer Attention in Neural Machine Translation",
    author = "Behnke, Maximiliana and
      Heafield, Kenneth",
    editor = "Webber, Bonnie and
      Cohn, Trevor and
      He, Yulan and
      Liu, Yang",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.emnlp-main.211",
    doi = "10.18653/v1/2020.emnlp-main.211",
    pages = "2664--2674",
    abstract = "The attention mechanism is the crucial component of the transformer architecture. Recent research shows that most attention heads are not confident in their decisions and can be pruned. However, removing them before training a model results in lower quality. In this paper, we apply the lottery ticket hypothesis to prune heads in the early stages of training. Our experiments on machine translation show that it is possible to remove up to three-quarters of attention heads from transformer-big during early training with an average -0.1 change in BLEU for Turkish→English. The pruned model is 1.5 times as fast at inference, albeit at the cost of longer training. Our method is complementary to other approaches, such as teacher-student, with English→German student model gaining an additional 10{\%} speed-up with 75{\%} encoder attention removed and 0.2 BLEU loss.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="behnke-heafield-2020-losing">
    <titleInfo>
      <title>Losing Heads in the Lottery: Pruning Transformer Attention in Neural Machine Translation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Maximiliana</namePart>
      <namePart type="family">Behnke</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kenneth</namePart>
      <namePart type="family">Heafield</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Bonnie</namePart>
        <namePart type="family">Webber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Trevor</namePart>
        <namePart type="family">Cohn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yulan</namePart>
        <namePart type="family">He</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The attention mechanism is the crucial component of the transformer architecture. Recent research shows that most attention heads are not confident in their decisions and can be pruned. However, removing them before training a model results in lower quality. In this paper, we apply the lottery ticket hypothesis to prune heads in the early stages of training. Our experiments on machine translation show that it is possible to remove up to three-quarters of attention heads from transformer-big during early training with an average -0.1 change in BLEU for Turkish→English. The pruned model is 1.5 times as fast at inference, albeit at the cost of longer training. Our method is complementary to other approaches, such as teacher-student, with English→German student model gaining an additional 10% speed-up with 75% encoder attention removed and 0.2 BLEU loss.</abstract>
    <identifier type="citekey">behnke-heafield-2020-losing</identifier>
    <identifier type="doi">10.18653/v1/2020.emnlp-main.211</identifier>
    <location>
      <url>https://aclanthology.org/2020.emnlp-main.211</url>
    </location>
    <part>
      <date>2020-11</date>
      <extent unit="page">
        <start>2664</start>
        <end>2674</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Losing Heads in the Lottery: Pruning Transformer Attention in Neural Machine Translation
%A Behnke, Maximiliana
%A Heafield, Kenneth
%Y Webber, Bonnie
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F behnke-heafield-2020-losing
%X The attention mechanism is the crucial component of the transformer architecture. Recent research shows that most attention heads are not confident in their decisions and can be pruned. However, removing them before training a model results in lower quality. In this paper, we apply the lottery ticket hypothesis to prune heads in the early stages of training. Our experiments on machine translation show that it is possible to remove up to three-quarters of attention heads from transformer-big during early training with an average -0.1 change in BLEU for Turkish→English. The pruned model is 1.5 times as fast at inference, albeit at the cost of longer training. Our method is complementary to other approaches, such as teacher-student, with English→German student model gaining an additional 10% speed-up with 75% encoder attention removed and 0.2 BLEU loss.
%R 10.18653/v1/2020.emnlp-main.211
%U https://aclanthology.org/2020.emnlp-main.211
%U https://doi.org/10.18653/v1/2020.emnlp-main.211
%P 2664-2674
Markdown (Informal)
[Losing Heads in the Lottery: Pruning Transformer Attention in Neural Machine Translation](https://aclanthology.org/2020.emnlp-main.211) (Behnke & Heafield, EMNLP 2020)
ACL
Maximiliana Behnke and Kenneth Heafield. 2020. Losing Heads in the Lottery: Pruning Transformer Attention in Neural Machine Translation. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 2664–2674, Online. Association for Computational Linguistics.