@inproceedings{kassem-etal-2025-reviving,
title = "{REVIVING} {YOUR} {MNEME}: Predicting The Side Effects of {LLM} Unlearning and Fine-Tuning via Sparse Model Diffing",
author = "Kassem, Aly M. and
Shi, Zhuan and
Rostamzadeh, Negar and
Farnadi, Golnoosh",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1641/",
pages = "32238--32251",
ISBN = "979-8-89176-332-6",
abstract = "LLMs are frequently fine-tuned or unlearned to adapt to new tasks or eliminate undesirable behaviors. While existing evaluation methods assess performance after such interventions, there remains no general approach for detecting unintended side effects{---}such as unlearning biology content degrading performance on chemistry tasks, particularly when these effects are unpredictable or emergent. To address this issue, we introduce MNEME, $\textit{Model diffiNg for Evaluating Mechanistic Effects}$, a framework for identifying these side effects using sparse model diffing. MNEME compares base and fine-tuned models on out-of-distribution (OOD) data (e.g., The Pile, LMSYS-Chat-1M), without access to fine-tuning data, to isolate behavioral shifts.Applied to five LLMs across three scenarios, WMDP knowledge unlearning, emergent misalignment, and benign fine-tuning, MNEME achieves up to 95{\%} accuracy in predicting side effects, aligning with known benchmarks and requiring no custom heuristics. Our results demonstrate that sparse probing and diffing offer a scalable and automated lens into fine-tuning-induced model changes, providing practical tools for understanding and managing LLM behavior."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kassem-etal-2025-reviving">
<titleInfo>
<title>REVIVING YOUR MNEME: Predicting The Side Effects of LLM Unlearning and Fine-Tuning via Sparse Model Diffing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aly</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Kassem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuan</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Negar</namePart>
<namePart type="family">Rostamzadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Golnoosh</namePart>
<namePart type="family">Farnadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>LLMs are frequently fine-tuned or unlearned to adapt to new tasks or eliminate undesirable behaviors. While existing evaluation methods assess performance after such interventions, there remains no general approach for detecting unintended side effects—such as unlearning biology content degrading performance on chemistry tasks, particularly when these effects are unpredictable or emergent. To address this issue, we introduce MNEME, Model diffiNg for Evaluating Mechanistic Effects, a framework for identifying these side effects using sparse model diffing. MNEME compares base and fine-tuned models on out-of-distribution (OOD) data (e.g., The Pile, LMSYS-Chat-1M), without access to fine-tuning data, to isolate behavioral shifts. Applied to five LLMs across three scenarios, WMDP knowledge unlearning, emergent misalignment, and benign fine-tuning, MNEME achieves up to 95% accuracy in predicting side effects, aligning with known benchmarks and requiring no custom heuristics. Our results demonstrate that sparse probing and diffing offer a scalable and automated lens into fine-tuning-induced model changes, providing practical tools for understanding and managing LLM behavior.</abstract>
<identifier type="citekey">kassem-etal-2025-reviving</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1641/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>32238</start>
<end>32251</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T REVIVING YOUR MNEME: Predicting The Side Effects of LLM Unlearning and Fine-Tuning via Sparse Model Diffing
%A Kassem, Aly M.
%A Shi, Zhuan
%A Rostamzadeh, Negar
%A Farnadi, Golnoosh
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F kassem-etal-2025-reviving
%X LLMs are frequently fine-tuned or unlearned to adapt to new tasks or eliminate undesirable behaviors. While existing evaluation methods assess performance after such interventions, there remains no general approach for detecting unintended side effects—such as unlearning biology content degrading performance on chemistry tasks, particularly when these effects are unpredictable or emergent. To address this issue, we introduce MNEME, Model diffiNg for Evaluating Mechanistic Effects, a framework for identifying these side effects using sparse model diffing. MNEME compares base and fine-tuned models on out-of-distribution (OOD) data (e.g., The Pile, LMSYS-Chat-1M), without access to fine-tuning data, to isolate behavioral shifts. Applied to five LLMs across three scenarios, WMDP knowledge unlearning, emergent misalignment, and benign fine-tuning, MNEME achieves up to 95% accuracy in predicting side effects, aligning with known benchmarks and requiring no custom heuristics. Our results demonstrate that sparse probing and diffing offer a scalable and automated lens into fine-tuning-induced model changes, providing practical tools for understanding and managing LLM behavior.
%U https://aclanthology.org/2025.emnlp-main.1641/
%P 32238-32251
Markdown (Informal)
[REVIVING YOUR MNEME: Predicting The Side Effects of LLM Unlearning and Fine-Tuning via Sparse Model Diffing](https://aclanthology.org/2025.emnlp-main.1641/) (Kassem et al., EMNLP 2025)
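As a rough, informal illustration of the comparison setup the abstract describes (running a base and a fine-tuned checkpoint on the same out-of-distribution prompts and inspecting how their internal activations diverge), the sketch below computes a crude dense activation diff between two models. It is not the MNEME implementation, which relies on sparse model diffing; the checkpoint names, layer index, and prompts are placeholder assumptions.

```python
# Illustrative sketch only: a crude dense activation diff between a base and a
# fine-tuned checkpoint on shared OOD prompts. NOT the MNEME method (which uses
# sparse model diffing); model names, LAYER, and prompts are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE = "your-org/base-model"         # hypothetical base checkpoint
TUNED = "your-org/fine-tuned-model"  # hypothetical fine-tuned/unlearned checkpoint
LAYER = 12                           # arbitrary hidden layer to inspect

tok = AutoTokenizer.from_pretrained(BASE)  # assumes both checkpoints share a tokenizer
base = AutoModelForCausalLM.from_pretrained(BASE, output_hidden_states=True).eval()
tuned = AutoModelForCausalLM.from_pretrained(TUNED, output_hidden_states=True).eval()

# A few out-of-distribution prompts standing in for corpora like The Pile.
ood_prompts = [
    "The mitochondria is the powerhouse of the cell because",
    "In organic chemistry, a nucleophile is",
]

@torch.no_grad()
def layer_activations(model, text):
    ids = tok(text, return_tensors="pt")
    out = model(**ids)
    # hidden_states: tuple of (num_layers + 1) tensors, each [1, seq_len, hidden]
    return out.hidden_states[LAYER].mean(dim=1).squeeze(0)  # mean-pool over tokens

for prompt in ood_prompts:
    diff = layer_activations(tuned, prompt) - layer_activations(base, prompt)
    top = torch.topk(diff.abs(), k=5)
    print(f"{prompt[:40]!r}: largest shifts at dims {top.indices.tolist()} "
          f"(|delta| up to {top.values.max().item():.3f})")
```

In this toy setup, prompts whose activations shift most between the two checkpoints flag topics the fine-tuning or unlearning may have touched, which is the intuition behind diffing models on OOD data without access to the fine-tuning set.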