@inproceedings{sekii-2025-flashback,
title = "Flashback: Memory Mechanism for Enhancing Memory Efficiency and Speed in Deep Sequential Models",
author = "Sekii, Taiki",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.575/",
pages = "8602--8611",
abstract = "In this study, we tackle three main challenges of deep sequential processing models in previous research: (1) memory degradation, (2) inaccurate gradient backpropagation, and (3) compatibility with next-token prediction. Specifically, to address (1-2), we define a Flashback property in which memory is preserved perfectly as an identity mapping of its stored value in a memory region until it is overwritten by a hidden state at a different time step. We propose a Flashback mechanism that satisfies this property in a fully differentiable, end-to-end manner. Further, to tackle (3), we propose architectures that incorporate the Flashback mechanism into Transformers and Mamba, enabling next-token prediction for language modeling tasks. In experiments, we trained on The Pile dataset, which includes diverse texts, to evaluate tradeoffs between commonsense reasoning accuracy, processing speed, and memory usage after introducing the Flashback mechanism into existing methods. The evaluations confirmed the effectiveness of the Flashback mechanism."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sekii-2025-flashback">
<titleInfo>
<title>Flashback: Memory Mechanism for Enhancing Memory Efficiency and Speed in Deep Sequential Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Taiki</namePart>
<namePart type="family">Sekii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this study, we tackle three main challenges of deep sequential processing models in previous research: (1) memory degradation, (2) inaccurate gradient backpropagation, and (3) compatibility with next-token prediction. Specifically, to address (1-2), we define a Flashback property in which memory is preserved perfectly as an identity mapping of its stored value in a memory region until it is overwritten by a hidden state at a different time step. We propose a Flashback mechanism that satisfies this property in a fully differentiable, end-to-end manner. Further, to tackle (3), we propose architectures that incorporate the Flashback mechanism into Transformers and Mamba, enabling next-token prediction for language modeling tasks. In experiments, we trained on The Pile dataset, which includes diverse texts, to evaluate tradeoffs between commonsense reasoning accuracy, processing speed, and memory usage after introducing the Flashback mechanism into existing methods. The evaluations confirmed the effectiveness of the Flashback mechanism.</abstract>
<identifier type="citekey">sekii-2025-flashback</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.575/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>8602</start>
<end>8611</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Flashback: Memory Mechanism for Enhancing Memory Efficiency and Speed in Deep Sequential Models
%A Sekii, Taiki
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F sekii-2025-flashback
%X In this study, we tackle three main challenges of deep sequential processing models in previous research: (1) memory degradation, (2) inaccurate gradient backpropagation, and (3) compatibility with next-token prediction. Specifically, to address (1-2), we define a Flashback property in which memory is preserved perfectly as an identity mapping of its stored value in a memory region until it is overwritten by a hidden state at a different time step. We propose a Flashback mechanism that satisfies this property in a fully differentiable, end-to-end manner. Further, to tackle (3), we propose architectures that incorporate the Flashback mechanism into Transformers and Mamba, enabling next-token prediction for language modeling tasks. In experiments, we trained on The Pile dataset, which includes diverse texts, to evaluate tradeoffs between commonsense reasoning accuracy, processing speed, and memory usage after introducing the Flashback mechanism into existing methods. The evaluations confirmed the effectiveness of the Flashback mechanism.
%U https://aclanthology.org/2025.coling-main.575/
%P 8602-8611
Markdown (Informal)
[Flashback: Memory Mechanism for Enhancing Memory Efficiency and Speed in Deep Sequential Models](https://aclanthology.org/2025.coling-main.575/) (Sekii, COLING 2025)
ACL
Taiki Sekii. 2025. [Flashback: Memory Mechanism for Enhancing Memory Efficiency and Speed in Deep Sequential Models](https://aclanthology.org/2025.coling-main.575/). In *Proceedings of the 31st International Conference on Computational Linguistics*, pages 8602–8611, Abu Dhabi, UAE. Association for Computational Linguistics.
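Note: the abstract (repeated in each export format above) defines the Flashback property as memory that is preserved as an identity mapping of its stored value in a memory region until a hidden state at a different time step overwrites it. The snippet below is a minimal, hypothetical sketch of that slot-update rule in plain NumPy; the function name, slot layout, and hard (non-differentiable) write index are assumptions made for illustration only and do not reproduce the paper's fully differentiable mechanism or its Transformer/Mamba integration.

```python
import numpy as np

def flashback_step(memory, hidden_state, write_index=None):
    """One update of a slot memory under the Flashback property described in
    the abstract: every slot is carried forward unchanged (identity mapping)
    unless it is explicitly overwritten by the current hidden state.
    Illustrative only; the paper's differentiable write is not reproduced."""
    new_memory = memory.copy()                   # identity mapping for untouched slots
    if write_index is not None:
        new_memory[write_index] = hidden_state   # overwrite one memory region
    return new_memory

# Hypothetical usage: 4 memory slots of dimension 8, written at two of four steps.
rng = np.random.default_rng(0)
memory = np.zeros((4, 8))
for write_index in [None, 0, None, 2]:
    hidden_state = rng.standard_normal(8)
    memory = flashback_step(memory, hidden_state, write_index)

# Slots 1 and 3 were never written, so they still hold their initial values exactly.
assert np.array_equal(memory[1], np.zeros(8)) and np.array_equal(memory[3], np.zeros(8))
```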