% BibTeX record for ACL Anthology paper P18-2116 (ACL 2018, Volume 2: Short Papers).
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{levy-etal-2018-long,
    title     = {Long Short-Term Memory as a Dynamically Computed Element-wise Weighted Sum},
    author    = {Levy, Omer and
                 Lee, Kenton and
                 FitzGerald, Nicholas and
                 Zettlemoyer, Luke},
    editor    = {Gurevych, Iryna and
                 Miyao, Yusuke},
    booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
    month     = jul,
    year      = {2018},
    address   = {Melbourne, Australia},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/P18-2116/},
    doi       = {10.18653/v1/P18-2116},
    pages     = {732--739},
    abstract  = {LSTMs were introduced to combat vanishing gradients in simple RNNs by augmenting them with gated additive recurrent connections. We present an alternative view to explain the success of LSTMs: the gates themselves are versatile recurrent models that provide more representational power than previously appreciated. We do this by decoupling the LSTM{'}s gates from the embedded simple RNN, producing a new class of RNNs where the recurrence computes an element-wise weighted sum of context-independent functions of the input. Ablations on a range of problems demonstrate that the gating mechanism alone performs as well as an LSTM in most settings, strongly suggesting that the gates are doing much more in practice than just alleviating vanishing gradients.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS v3 record for the paper with citekey levy-etal-2018-long: title, authors, host proceedings, abstract, identifiers, and page extent. -->
<mods ID="levy-etal-2018-long">
    <titleInfo>
        <title>Long Short-Term Memory as a Dynamically Computed Element-wise Weighted Sum</title>
    </titleInfo>
    <!-- Authors: each is a personal name split into given and family parts, tagged with the "author" role term. -->
    <name type="personal">
        <namePart type="given">Omer</namePart>
        <namePart type="family">Levy</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Kenton</namePart>
        <namePart type="family">Lee</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Nicholas</namePart>
        <namePart type="family">FitzGerald</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Luke</namePart>
        <namePart type="family">Zettlemoyer</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <!-- Issue date of the paper, given as year and month. -->
    <originInfo>
        <dateIssued>2018-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume containing the paper, with its editors, publisher, and place. -->
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Iryna</namePart>
            <namePart type="family">Gurevych</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Yusuke</namePart>
            <namePart type="family">Miyao</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Melbourne, Australia</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>LSTMs were introduced to combat vanishing gradients in simple RNNs by augmenting them with gated additive recurrent connections. We present an alternative view to explain the success of LSTMs: the gates themselves are versatile recurrent models that provide more representational power than previously appreciated. We do this by decoupling the LSTM’s gates from the embedded simple RNN, producing a new class of RNNs where the recurrence computes an element-wise weighted sum of context-independent functions of the input. Ablations on a range of problems demonstrate that the gating mechanism alone performs as well as an LSTM in most settings, strongly suggesting that the gates are doing much more in practice than just alleviating vanishing gradients.</abstract>
    <!-- Identifiers: the citation key and the DOI; the landing-page URL follows in the location element. -->
    <identifier type="citekey">levy-etal-2018-long</identifier>
    <identifier type="doi">10.18653/v1/P18-2116</identifier>
    <location>
        <url>https://aclanthology.org/P18-2116/</url>
    </location>
    <!-- Date and first/last page of the paper. -->
    <part>
        <date>2018-07</date>
        <extent unit="page">
            <start>732</start>
            <end>739</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Long Short-Term Memory as a Dynamically Computed Element-wise Weighted Sum
%A Levy, Omer
%A Lee, Kenton
%A FitzGerald, Nicholas
%A Zettlemoyer, Luke
%Y Gurevych, Iryna
%Y Miyao, Yusuke
%S Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2018
%8 July
%I Association for Computational Linguistics
%C Melbourne, Australia
%F levy-etal-2018-long
%X LSTMs were introduced to combat vanishing gradients in simple RNNs by augmenting them with gated additive recurrent connections. We present an alternative view to explain the success of LSTMs: the gates themselves are versatile recurrent models that provide more representational power than previously appreciated. We do this by decoupling the LSTM’s gates from the embedded simple RNN, producing a new class of RNNs where the recurrence computes an element-wise weighted sum of context-independent functions of the input. Ablations on a range of problems demonstrate that the gating mechanism alone performs as well as an LSTM in most settings, strongly suggesting that the gates are doing much more in practice than just alleviating vanishing gradients.
%R 10.18653/v1/P18-2116
%U https://aclanthology.org/P18-2116/
%U https://doi.org/10.18653/v1/P18-2116
%P 732-739
Markdown (Informal)
[Long Short-Term Memory as a Dynamically Computed Element-wise Weighted Sum](https://aclanthology.org/P18-2116/) (Levy et al., ACL 2018)
ACL