@inproceedings{rae-razavi-2020-transformers,
title = "Do Transformers Need Deep Long-Range Memory?",
author = "Rae, Jack and
Razavi, Ali",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.672",
doi = "10.18653/v1/2020.acl-main.672",
pages = "7524--7529",
abstract = "Deep attention models have advanced the modelling of sequential data across many domains. For language modelling in particular, the Transformer-XL {---} a Transformer augmented with a long-range memory of past activations {---} has been shown to be state-of-the-art across a variety of well-studied benchmarks. The Transformer-XL incorporates a long-range memory at every layer of the network, which renders its state to be thousands of times larger than RNN predecessors. However it is unclear whether this is necessary. We perform a set of interventions to show that comparable performance can be obtained with 6X fewer long range memories and better performance can be obtained by limiting the range of attention in lower layers of the network.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="rae-razavi-2020-transformers">
    <titleInfo>
      <title>Do Transformers Need Deep Long-Range Memory?</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jack</namePart>
      <namePart type="family">Rae</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ali</namePart>
      <namePart type="family">Razavi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Dan</namePart>
        <namePart type="family">Jurafsky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Chai</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Natalie</namePart>
        <namePart type="family">Schluter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joel</namePart>
        <namePart type="family">Tetreault</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Deep attention models have advanced the modelling of sequential data across many domains. For language modelling in particular, the Transformer-XL — a Transformer augmented with a long-range memory of past activations — has been shown to be state-of-the-art across a variety of well-studied benchmarks. The Transformer-XL incorporates a long-range memory at every layer of the network, which renders its state thousands of times larger than that of its RNN predecessors. However, it is unclear whether this is necessary. We perform a set of interventions to show that comparable performance can be obtained with 6X fewer long-range memories, and that better performance can be obtained by limiting the range of attention in lower layers of the network.</abstract>
<identifier type="citekey">rae-razavi-2020-transformers</identifier>
<identifier type="doi">10.18653/v1/2020.acl-main.672</identifier>
<location>
<url>https://aclanthology.org/2020.acl-main.672</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>7524</start>
<end>7529</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do Transformers Need Deep Long-Range Memory?
%A Rae, Jack
%A Razavi, Ali
%Y Jurafsky, Dan
%Y Chai, Joyce
%Y Schluter, Natalie
%Y Tetreault, Joel
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F rae-razavi-2020-transformers
%X Deep attention models have advanced the modelling of sequential data across many domains. For language modelling in particular, the Transformer-XL — a Transformer augmented with a long-range memory of past activations — has been shown to be state-of-the-art across a variety of well-studied benchmarks. The Transformer-XL incorporates a long-range memory at every layer of the network, which renders its state thousands of times larger than that of its RNN predecessors. However, it is unclear whether this is necessary. We perform a set of interventions to show that comparable performance can be obtained with 6X fewer long-range memories, and that better performance can be obtained by limiting the range of attention in lower layers of the network.
%R 10.18653/v1/2020.acl-main.672
%U https://aclanthology.org/2020.acl-main.672
%U https://doi.org/10.18653/v1/2020.acl-main.672
%P 7524-7529
Markdown (Informal)
[Do Transformers Need Deep Long-Range Memory?](https://aclanthology.org/2020.acl-main.672) (Rae & Razavi, ACL 2020)

ACL
Jack Rae and Ali Razavi. 2020. Do Transformers Need Deep Long-Range Memory?. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 7524–7529, Online. Association for Computational Linguistics.
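The abstract above describes two interventions on Transformer-XL-style models: keeping a long-range memory in only a few layers, and restricting lower layers to short-range attention. The snippet below is a minimal NumPy sketch of that idea, not the authors' implementation; the memory-length schedule, function names, and toy dimensions are assumptions chosen only for illustration.

```python
import numpy as np

def softmax(x, axis=-1):
    # Numerically stable softmax.
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def attend(query, keys, values):
    # Scaled dot-product attention of one query over the visible keys/values.
    d = query.shape[-1]
    scores = keys @ query / np.sqrt(d)
    return softmax(scores) @ values

def memory_schedule(n_layers, short_len, long_len, n_long_layers):
    # Hypothetical schedule: only the top `n_long_layers` keep the long-range
    # memory; lower layers attend over a short local window.
    return [short_len] * (n_layers - n_long_layers) + [long_len] * n_long_layers

# Toy illustration: one query per layer attends over cached past states,
# truncated to that layer's memory length.
rng = np.random.default_rng(0)
d_model, cache_len = 16, 512
past_keys = rng.standard_normal((cache_len, d_model))
past_values = rng.standard_normal((cache_len, d_model))
query = rng.standard_normal(d_model)

schedule = memory_schedule(n_layers=12, short_len=64, long_len=512, n_long_layers=2)
for layer, mem_len in enumerate(schedule):
    out = attend(query, past_keys[-mem_len:], past_values[-mem_len:])
    print(f"layer {layer:2d}: memory length {mem_len:3d}, output norm {np.linalg.norm(out):.3f}")
```

With 12 layers and only 2 long-range memories, the ratio roughly matches the 6X reduction mentioned in the abstract; the models studied in the paper are larger, so treat these numbers purely as placeholders.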