@inproceedings{mao-2022-fine,
    title = "Fine-Tuning Pre-trained Transformers into Decaying Fast Weights",
    author = "Mao, Huanru Henry",
    editor = "Goldberg, Yoav and
      Kozareva, Zornitsa and
      Zhang, Yue",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-main.697",
    doi = "10.18653/v1/2022.emnlp-main.697",
    pages = "10236--10242",
    abstract = "Autoregressive Transformers are strong language models but incur O(T) complexity during per-token generation due to the self-attention mechanism. Recent work proposes kernel-based methods to approximate causal self-attention by replacing it with recurrent formulations with various update rules and feature maps to achieve O(1) time and memory complexity. We explore these approaches and find that they are unnecessarily complex, and propose a simple alternative - decaying fast weights - that runs fast on GPU, outperforms prior methods, and retains 99{\%} of attention{'}s performance for GPT-2. We also show competitive performance on WikiText-103 against more complex attention substitutes.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mao-2022-fine">
<titleInfo>
<title>Fine-Tuning Pre-trained Transformers into Decaying Fast Weights</title>
</titleInfo>
<name type="personal">
<namePart type="given">Huanru</namePart>
<namePart type="given">Henry</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Autoregressive Transformers are strong language models but incur O(T) complexity during per-token generation due to the self-attention mechanism. Recent work proposes kernel-based methods to approximate causal self-attention by replacing it with recurrent formulations with various update rules and feature maps to achieve O(1) time and memory complexity. We explore these approaches and find that they are unnecessarily complex, and propose a simple alternative - decaying fast weights - that runs fast on GPU, outperforms prior methods, and retains 99% of attention’s performance for GPT-2. We also show competitive performance on WikiText-103 against more complex attention substitutes.</abstract>
<identifier type="citekey">mao-2022-fine</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.697</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.697</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>10236</start>
<end>10242</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Fine-Tuning Pre-trained Transformers into Decaying Fast Weights
%A Mao, Huanru Henry
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F mao-2022-fine
%X Autoregressive Transformers are strong language models but incur O(T) complexity during per-token generation due to the self-attention mechanism. Recent work proposes kernel-based methods to approximate causal self-attention by replacing it with recurrent formulations with various update rules and feature maps to achieve O(1) time and memory complexity. We explore these approaches and find that they are unnecessarily complex, and propose a simple alternative - decaying fast weights - that runs fast on GPU, outperforms prior methods, and retains 99% of attention’s performance for GPT-2. We also show competitive performance on WikiText-103 against more complex attention substitutes.
%R 10.18653/v1/2022.emnlp-main.697
%U https://aclanthology.org/2022.emnlp-main.697
%U https://doi.org/10.18653/v1/2022.emnlp-main.697
%P 10236-10242
Markdown (Informal)
[Fine-Tuning Pre-trained Transformers into Decaying Fast Weights](https://aclanthology.org/2022.emnlp-main.697) (Mao, EMNLP 2022)
ACL
Huanru Henry Mao. 2022. [Fine-Tuning Pre-trained Transformers into Decaying Fast Weights](https://aclanthology.org/2022.emnlp-main.697). In *Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing*, pages 10236–10242, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
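
For readers skimming this record, the core idea summarized in the abstract - replacing O(T) causal self-attention with a constant-time recurrent update over a decaying fast-weight memory - can be illustrated with a small sketch. The snippet below is an assumption-laden illustration of a generic decaying fast-weight recurrence (the function name, the non-negative feature maps, the scalar decay, and the normalization are all placeholders), not the paper's actual formulation; see the paper at the URL above for the real method.

```python
# A minimal, illustrative sketch of a decaying fast-weight recurrence in the spirit
# of the abstract above. NOT the paper's exact update rule, feature map, or
# parameterization; everything here is an assumption made for illustration.
import numpy as np


def decaying_fast_weight_step(state, norm, q, k, v, decay):
    """One O(1) generation step: decay the memory, write a new key-value
    association, then read it out with the query.

    state: (d_k, d_v) fast-weight matrix carried across tokens
    norm:  (d_k,) running key normalizer, decayed in lockstep with the state
    q, k:  (d_k,) non-negative query/key features
    v:     (d_v,) value vector
    decay: scalar in (0, 1); smaller values forget old context faster
    """
    state = decay * state + np.outer(k, v)   # fade old memory, add new outer product
    norm = decay * norm + k                  # keep the normalizer consistent with state
    out = q @ state / (q @ norm + 1e-6)      # linear-attention-style normalized readout
    return out, state, norm


# Toy usage: generate token by token with constant time and memory per step,
# instead of attending over all previous tokens.
d_k, d_v, seq_len = 8, 8, 16
rng = np.random.default_rng(0)
state, norm = np.zeros((d_k, d_v)), np.zeros(d_k)
for _ in range(seq_len):
    q, k, v = np.abs(rng.normal(size=(3, d_k)))  # stand-ins for per-token projections
    out, state, norm = decaying_fast_weight_step(state, norm, q, k, v, decay=0.95)
print(out.shape)  # (8,)
```

Because only the state matrix and normalizer are carried between steps, per-token cost stays constant regardless of sequence length, which is the property the abstract contrasts with the O(T) cost of full self-attention.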