@inproceedings{rui-miura-2025-forgetter,
title = "{FORGETTER} with forgetful hyperparameters and recurring sleeps can continue to learn beyond normal overtfitting limits",
author = "Rui, Yamamoto and
Miura, Keiji",
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.babylm-main.7/",
pages = "91--99",
ISBN = "TODO",
abstract = "LLMs suffer from considerable computational costs in training. A more biologically plausible curriculum learning may help to decrease the learning costs. Here we propose a FORGETTER training algorithm, in which a model forgets the variables for optimization after a sleep and the hyperparameters are set toward forgetting memory: rather large weight decay and learning rates as well as small but optimized batch sizes. By limiting minGemma model to 512 input length and speeding up the development cycle, we compared normal and FORGETTER learning algorithms by using more than a thousand different models. Specifically, we found and utilized the ``120-rule'' that the models with about 120 (Query) heads in total, irrespective of the head number per layer, outperform. The improvement by using the FORGETTER algorithm is far bigger than that by optimizing the model structure. Specifically, FORGETTER models can learn beyond the data size where the normal learning overfits. The FORGETTER also works for CIFAR10 image classification. These results suggest that forgetting can be beneficial for pretraining deep neural networks by avoiding overfitting."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rui-miura-2025-forgetter">
<titleInfo>
<title>FORGETTER with forgetful hyperparameters and recurring sleeps can continue to learn beyond normal overtfitting limits</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yamamoto</namePart>
<namePart type="family">Rui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keiji</namePart>
<namePart type="family">Miura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First BabyLM Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lucas</namePart>
<namePart type="family">Charpentier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leshem</namePart>
<namePart type="family">Choshen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mustafa</namePart>
<namePart type="given">Omer</namePart>
<namePart type="family">Gul</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="given">Y</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaap</namePart>
<namePart type="family">Jumelet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tal</namePart>
<namePart type="family">Linzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aaron</namePart>
<namePart type="family">Mueller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Candace</namePart>
<namePart type="family">Ross</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raj</namePart>
<namePart type="given">Sanjay</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Warstadt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ethan</namePart>
<namePart type="given">Gotlieb</namePart>
<namePart type="family">Wilcox</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adina</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">TODO</identifier>
</relatedItem>
<abstract>LLMs suffer from considerable computational costs in training. A more biologically plausible curriculum learning may help to decrease the learning costs. Here we propose a FORGETTER training algorithm, in which a model forgets the variables for optimization after a sleep and the hyperparameters are set toward forgetting memory: rather large weight decay and learning rates as well as small but optimized batch sizes. By limiting minGemma model to 512 input length and speeding up the development cycle, we compared normal and FORGETTER learning algorithms by using more than a thousand different models. Specifically, we found and utilized the “120-rule” that the models with about 120 (Query) heads in total, irrespective of the head number per layer, outperform. The improvement by using the FORGETTER algorithm is far bigger than that by optimizing the model structure. Specifically, FORGETTER models can learn beyond the data size where the normal learning overfits. The FORGETTER also works for CIFAR10 image classification. These results suggest that forgetting can be beneficial for pretraining deep neural networks by avoiding overfitting.</abstract>
<identifier type="citekey">rui-miura-2025-forgetter</identifier>
<location>
<url>https://aclanthology.org/2025.babylm-main.7/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>91</start>
<end>99</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FORGETTER with forgetful hyperparameters and recurring sleeps can continue to learn beyond normal overtfitting limits
%A Rui, Yamamoto
%A Miura, Keiji
%Y Charpentier, Lucas
%Y Choshen, Leshem
%Y Cotterell, Ryan
%Y Gul, Mustafa Omer
%Y Hu, Michael Y.
%Y Liu, Jing
%Y Jumelet, Jaap
%Y Linzen, Tal
%Y Mueller, Aaron
%Y Ross, Candace
%Y Shah, Raj Sanjay
%Y Warstadt, Alex
%Y Wilcox, Ethan Gotlieb
%Y Williams, Adina
%S Proceedings of the First BabyLM Workshop
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ TODO
%F rui-miura-2025-forgetter
%X LLMs suffer from considerable computational costs in training. A more biologically plausible curriculum learning may help to decrease the learning costs. Here we propose a FORGETTER training algorithm, in which a model forgets the variables for optimization after a sleep and the hyperparameters are set toward forgetting memory: rather large weight decay and learning rates as well as small but optimized batch sizes. By limiting minGemma model to 512 input length and speeding up the development cycle, we compared normal and FORGETTER learning algorithms by using more than a thousand different models. Specifically, we found and utilized the “120-rule” that the models with about 120 (Query) heads in total, irrespective of the head number per layer, outperform. The improvement by using the FORGETTER algorithm is far bigger than that by optimizing the model structure. Specifically, FORGETTER models can learn beyond the data size where the normal learning overfits. The FORGETTER also works for CIFAR10 image classification. These results suggest that forgetting can be beneficial for pretraining deep neural networks by avoiding overfitting.
%U https://aclanthology.org/2025.babylm-main.7/
%P 91-99
Markdown (Informal)
[FORGETTER with forgetful hyperparameters and recurring sleeps can continue to learn beyond normal overtfitting limits](https://aclanthology.org/2025.babylm-main.7/) (Rui & Miura, BabyLM 2025)
ACL