@inproceedings{arora-etal-2019-lstm,
title = "Does an {LSTM} forget more than a {CNN}? An empirical study of catastrophic forgetting in {NLP}",
author = "Arora, Gaurav and
Rahimi, Afshin and
Baldwin, Timothy",
editor = "Mistica, Meladel and
Piccardi, Massimo and
MacKinlay, Andrew",
booktitle = "Proceedings of the 17th Annual Workshop of the Australasian Language Technology Association",
month = "4--6 " # dec,
year = "2019",
address = "Sydney, Australia",
publisher = "Australasian Language Technology Association",
url = "https://aclanthology.org/U19-1011",
pages = "77--86",
abstract = "Catastrophic forgetting {---} whereby a model trained on one task is fine-tuned on a second, and in doing so, suffers a {``}catastrophic{''} drop in performance over the first task {---} is a hurdle in the development of better transfer learning techniques. Despite impressive progress in reducing catastrophic forgetting, we have limited understanding of how different architectures and hyper-parameters affect forgetting in a network. With this study, we aim to understand factors which cause forgetting during sequential training. Our primary finding is that CNNs forget less than LSTMs. We show that max-pooling is the underlying operation which helps CNNs alleviate forgetting compared to LSTMs. We also found that curriculum learning, placing a hard task towards the end of task sequence, reduces forgetting. We analysed the effect of fine-tuning contextual embeddings on catastrophic forgetting and found that using embeddings as feature extractor is preferable to fine-tuning in continual learning setup.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arora-etal-2019-lstm">
<titleInfo>
<title>Does an LSTM forget more than a CNN? An empirical study of catastrophic forgetting in NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gaurav</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Afshin</namePart>
<namePart type="family">Rahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timothy</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>4–6 December 2019</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th Annual Workshop of the Australasian Language Technology Association</title>
</titleInfo>
<name type="personal">
<namePart type="given">Meladel</namePart>
<namePart type="family">Mistica</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Massimo</namePart>
<namePart type="family">Piccardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">MacKinlay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Australasian Language Technology Association</publisher>
<place>
<placeTerm type="text">Sydney, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Catastrophic forgetting — whereby a model trained on one task is fine-tuned on a second, and in doing so, suffers a “catastrophic” drop in performance over the first task — is a hurdle in the development of better transfer learning techniques. Despite impressive progress in reducing catastrophic forgetting, we have limited understanding of how different architectures and hyper-parameters affect forgetting in a network. With this study, we aim to understand factors which cause forgetting during sequential training. Our primary finding is that CNNs forget less than LSTMs. We show that max-pooling is the underlying operation which helps CNNs alleviate forgetting compared to LSTMs. We also found that curriculum learning, placing a hard task towards the end of task sequence, reduces forgetting. We analysed the effect of fine-tuning contextual embeddings on catastrophic forgetting and found that using embeddings as feature extractor is preferable to fine-tuning in continual learning setup.</abstract>
<identifier type="citekey">arora-etal-2019-lstm</identifier>
<location>
<url>https://aclanthology.org/U19-1011</url>
</location>
<part>
<date>4–6 December 2019</date>
<extent unit="page">
<start>77</start>
<end>86</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Does an LSTM forget more than a CNN? An empirical study of catastrophic forgetting in NLP
%A Arora, Gaurav
%A Rahimi, Afshin
%A Baldwin, Timothy
%Y Mistica, Meladel
%Y Piccardi, Massimo
%Y MacKinlay, Andrew
%S Proceedings of the 17th Annual Workshop of the Australasian Language Technology Association
%D 2019
%8 4–6 December
%I Australasian Language Technology Association
%C Sydney, Australia
%F arora-etal-2019-lstm
%X Catastrophic forgetting — whereby a model trained on one task is fine-tuned on a second, and in doing so, suffers a “catastrophic” drop in performance over the first task — is a hurdle in the development of better transfer learning techniques. Despite impressive progress in reducing catastrophic forgetting, we have limited understanding of how different architectures and hyper-parameters affect forgetting in a network. With this study, we aim to understand factors which cause forgetting during sequential training. Our primary finding is that CNNs forget less than LSTMs. We show that max-pooling is the underlying operation which helps CNNs alleviate forgetting compared to LSTMs. We also found that curriculum learning, placing a hard task towards the end of task sequence, reduces forgetting. We analysed the effect of fine-tuning contextual embeddings on catastrophic forgetting and found that using embeddings as feature extractor is preferable to fine-tuning in continual learning setup.
%U https://aclanthology.org/U19-1011
%P 77-86
Markdown (Informal)
[Does an LSTM forget more than a CNN? An empirical study of catastrophic forgetting in NLP](https://aclanthology.org/U19-1011) (Arora et al., ALTA 2019)