% Conference paper in the ACL 2026 proceedings (Volume 1: Long Papers).
% The same record follows below as MODS XML and as an EndNote export.
@inproceedings{gu-etal-2026-diagnosing,
  title     = {Diagnosing Hidden Instabilities in Model Editing via Uncertainty Quantification},
  author    = {Gu, Zihan and
               Zhang, TianYi and
               Zhang, Xinyan and
               Wang, Zhiyuan and
               Zhang, Han and
               Wei, Yuhao and
               Lu, Jiacheng and
               Ma, Tianyi and
               Zhang, Xingsheng and
               Zhang, Hua and
               Hu, Yue},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.1502/},
  pages     = {32544--32566},
  isbn      = {979-8-89176-390-6},
  abstract  = {Model editing provides a promising mechanism for updating large language models (LLMs) without expensive retraining. Existing approaches, particularly locate-and-edit methods based on least-squares optimization, aim to introduce targeted knowledge changes while preserving pre-trained behavior. In this work, we show that this objective is fundamentally fragile under standard single-edit evaluation protocols. We first develop a unified theoretical framework that characterizes activation-based editing as a constrained intervention on intermediate representations. Within this framework, we demonstrate that least-squares edits cannot, in general, isolate target updates from unrelated activations, giving rise to unavoidable interference that accumulates with successive edits. Crucially, this degradation can remain undetected in single-edit settings when assessed using conventional success and locality metrics. To expose such hidden instabilities, we introduce an uncertainty-based evaluation protocol that combines structured semantic perturbations with uncertainty quantification based on Sampling with Perturbation for UQ. By measuring edit-induced growth in aleatoric and epistemic uncertainty, our method reveals local knowledge conflicts that are invisible to existing benchmarks. Extensive experiments across multiple models, datasets, and editing algorithms show that both least-squares and other parameter-update-based methods consistently increase post-edit uncertainty. Together, our results suggest that current evaluation practices substantially overestimate the reliability of single-edit model editing, and that uncertainty-based diagnostics are necessary for assessing edit stability.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="gu-etal-2026-diagnosing">

    <!-- Title of the paper -->
    <titleInfo>
      <title>Diagnosing Hidden Instabilities in Model Editing via Uncertainty Quantification</title>
    </titleInfo>

    <!-- Authors: one personal name element per person, each with role "author" -->
    <name type="personal">
      <namePart type="given">Zihan</namePart>
      <namePart type="family">Gu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">TianYi</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xinyan</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhiyuan</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Han</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuhao</namePart>
      <namePart type="family">Wei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiacheng</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tianyi</namePart>
      <namePart type="family">Ma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xingsheng</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hua</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yue</namePart>
      <namePart type="family">Hu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year-month) -->
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>

    <!-- Abstract text -->
    <abstract>Model editing provides a promising mechanism for updating large language models (LLMs) without expensive retraining. Existing approaches, particularly locate-and-edit methods based on least-squares optimization, aim to introduce targeted knowledge changes while preserving pre-trained behavior. In this work, we show that this objective is fundamentally fragile under standard single-edit evaluation protocols. We first develop a unified theoretical framework that characterizes activation-based editing as a constrained intervention on intermediate representations. Within this framework, we demonstrate that least-squares edits cannot, in general, isolate target updates from unrelated activations, giving rise to unavoidable interference that accumulates with successive edits. Crucially, this degradation can remain undetected in single-edit settings when assessed using conventional success and locality metrics. To expose such hidden instabilities, we introduce an uncertainty-based evaluation protocol that combines structured semantic perturbations with uncertainty quantification based on Sampling with Perturbation for UQ. By measuring edit-induced growth in aleatoric and epistemic uncertainty, our method reveals local knowledge conflicts that are invisible to existing benchmarks. Extensive experiments across multiple models, datasets, and editing algorithms show that both least-squares and other parameter-update-based methods consistently increase post-edit uncertainty. Together, our results suggest that current evaluation practices substantially overestimate the reliability of single-edit model editing, and that uncertainty-based diagnostics are necessary for assessing edit stability.</abstract>

    <!-- Citation key, matching the ID attribute on the enclosing mods element -->
    <identifier type="citekey">gu-etal-2026-diagnosing</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.1502/</url>
    </location>

    <!-- Page range of the paper -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>32544</start>
        <end>32566</end>
      </extent>
    </part>

  </mods>
</modsCollection>
%0 Conference Proceedings
%T Diagnosing Hidden Instabilities in Model Editing via Uncertainty Quantification
%A Gu, Zihan
%A Zhang, TianYi
%A Zhang, Xinyan
%A Wang, Zhiyuan
%A Zhang, Han
%A Wei, Yuhao
%A Lu, Jiacheng
%A Ma, Tianyi
%A Zhang, Xingsheng
%A Zhang, Hua
%A Hu, Yue
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F gu-etal-2026-diagnosing
%X Model editing provides a promising mechanism for updating large language models (LLMs) without expensive retraining. Existing approaches, particularly locate-and-edit methods based on least-squares optimization, aim to introduce targeted knowledge changes while preserving pre-trained behavior. In this work, we show that this objective is fundamentally fragile under standard single-edit evaluation protocols. We first develop a unified theoretical framework that characterizes activation-based editing as a constrained intervention on intermediate representations. Within this framework, we demonstrate that least-squares edits cannot, in general, isolate target updates from unrelated activations, giving rise to unavoidable interference that accumulates with successive edits. Crucially, this degradation can remain undetected in single-edit settings when assessed using conventional success and locality metrics. To expose such hidden instabilities, we introduce an uncertainty-based evaluation protocol that combines structured semantic perturbations with uncertainty quantification based on Sampling with Perturbation for UQ. By measuring edit-induced growth in aleatoric and epistemic uncertainty, our method reveals local knowledge conflicts that are invisible to existing benchmarks. Extensive experiments across multiple models, datasets, and editing algorithms show that both least-squares and other parameter-update-based methods consistently increase post-edit uncertainty. Together, our results suggest that current evaluation practices substantially overestimate the reliability of single-edit model editing, and that uncertainty-based diagnostics are necessary for assessing edit stability.
%U https://aclanthology.org/2026.acl-long.1502/
%P 32544-32566
Markdown (Informal)
[Diagnosing Hidden Instabilities in Model Editing via Uncertainty Quantification](https://aclanthology.org/2026.acl-long.1502/) (Gu et al., ACL 2026)
ACL
- Zihan Gu, TianYi Zhang, Xinyan Zhang, Zhiyuan Wang, Han Zhang, Yuhao Wei, Jiacheng Lu, Tianyi Ma, Xingsheng Zhang, Hua Zhang, and Yue Hu. 2026. Diagnosing Hidden Instabilities in Model Editing via Uncertainty Quantification. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 32544–32566, San Diego, California, United States. Association for Computational Linguistics.