% ACL Anthology record for a paper in the ACL 2026 short-papers volume.
% Field values (including the abstract) are kept verbatim from the record;
% only layout, delimiters and field-name casing are normalised.
@inproceedings{abdi-etal-2026-evolutionary,
  title     = {Evolutionary Strategies at Scale lead to Catastrophic Forgetting},
  author    = {Abdi, Immanuel and
               Gupta, Akshat and
               Mok, Micah and
               Lu, Alex and
               Lee, Nicholas and
               Anumanchipalli, Gopala},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 2: Short Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-short.18/},
  pages     = {194--204},
  isbn      = {979-8-89176-391-3},
  abstract  = {One of biggest missing capabilities in state-of-the-art AI systems is the ability to learn continually after deployment. However, implementing an inference-time learning system has several challenges including the large memory requirement of gradient-based algorithms that are used to train state-of-the-art LLMs. Evolutionary Strategies (ES) have recently re-emerged as a gradient-free alternative to traditional learning algorithms and have shown encouraging performance on specific tasks in LLMs. In this paper, we perform a more comprehensive analysis of ES and specifically evaluate its forgetting curves when training for a larger number of update steps. We find that although ES is able to reach performance numbers closer to GRPO for math and reasoning tasks, it is accompanied by significant forgetting of prior abilities. We also show that the updates made using ES are much less sparse and have a larger l2 norm compared to corresponding GRPO updates, explaining the contrasting forgetting curves between the two algorithms. With this study, we aim to specifically highlight the issue of forgetting in gradient-free algorithms like ES and hope to inspire future work to mitigate these issues.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID equals the citekey identifier further down. -->
  <mods ID="abdi-etal-2026-evolutionary">
    <titleInfo>
      <title>Evolutionary Strategies at Scale lead to Catastrophic Forgetting</title>
    </titleInfo>

    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Immanuel</namePart>
      <namePart type="family">Abdi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Akshat</namePart>
      <namePart type="family">Gupta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Micah</namePart>
      <namePart type="family">Mok</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alex</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nicholas</namePart>
      <namePart type="family">Lee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gopala</namePart>
      <namePart type="family">Anumanchipalli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-391-3</identifier>
    </relatedItem>

    <!-- Abstract text is kept on one line so the text node is unchanged. -->
    <abstract>One of biggest missing capabilities in state-of-the-art AI systems is the ability to learn continually after deployment. However, implementing an inference-time learning system has several challenges including the large memory requirement of gradient-based algorithms that are used to train state-of-the-art LLMs. Evolutionary Strategies (ES) have recently re-emerged as a gradient-free alternative to traditional learning algorithms and have shown encouraging performance on specific tasks in LLMs. In this paper, we perform a more comprehensive analysis of ES and specifically evaluate its forgetting curves when training for a larger number of update steps. We find that although ES is able to reach performance numbers closer to GRPO for math and reasoning tasks, it is accompanied by significant forgetting of prior abilities. We also show that the updates made using ES are much less sparse and have a larger l2 norm compared to corresponding GRPO updates, explaining the contrasting forgetting curves between the two algorithms. With this study, we aim to specifically highlight the issue of forgetting in gradient-free algorithms like ES and hope to inspire future work to mitigate these issues.</abstract>
    <identifier type="citekey">abdi-etal-2026-evolutionary</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-short.18/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>194</start>
        <end>204</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Evolutionary Strategies at Scale lead to Catastrophic Forgetting
%A Abdi, Immanuel
%A Gupta, Akshat
%A Mok, Micah
%A Lu, Alex
%A Lee, Nicholas
%A Anumanchipalli, Gopala
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-391-3
%F abdi-etal-2026-evolutionary
%X One of biggest missing capabilities in state-of-the-art AI systems is the ability to learn continually after deployment. However, implementing an inference-time learning system has several challenges including the large memory requirement of gradient-based algorithms that are used to train state-of-the-art LLMs. Evolutionary Strategies (ES) have recently re-emerged as a gradient-free alternative to traditional learning algorithms and have shown encouraging performance on specific tasks in LLMs. In this paper, we perform a more comprehensive analysis of ES and specifically evaluate its forgetting curves when training for a larger number of update steps. We find that although ES is able to reach performance numbers closer to GRPO for math and reasoning tasks, it is accompanied by significant forgetting of prior abilities. We also show that the updates made using ES are much less sparse and have a larger l2 norm compared to corresponding GRPO updates, explaining the contrasting forgetting curves between the two algorithms. With this study, we aim to specifically highlight the issue of forgetting in gradient-free algorithms like ES and hope to inspire future work to mitigate these issues.
%U https://aclanthology.org/2026.acl-short.18/
%P 194-204
Markdown (Informal)
[Evolutionary Strategies at Scale lead to Catastrophic Forgetting](https://aclanthology.org/2026.acl-short.18/) (Abdi et al., ACL 2026)
ACL
- Immanuel Abdi, Akshat Gupta, Micah Mok, Alex Lu, Nicholas Lee, and Gopala Anumanchipalli. 2026. Evolutionary Strategies at Scale lead to Catastrophic Forgetting. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 194–204, San Diego, California, United States. Association for Computational Linguistics.