@inproceedings{dong-etal-2025-undial,
title = "{UNDIAL}: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models",
author = "Dong, Yijiang River and
Lin, Hongzhou and
Belkin, Mikhail and
Huerta, Ramon and
Vuli{\'c}, Ivan",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.444/",
doi = "10.18653/v1/2025.naacl-long.444",
pages = "8827--8840",
ISBN = "979-8-89176-189-6",
abstract = "Mitigating the retention of sensitive or private information in large language models is essential for enhancing privacy and safety. Existing unlearning methods, like Gradient Ascent and Negative Preference Optimization, directly tune models to remove unwanted information. However, these methods often become unstable because they fine-tune by maximizing loss, which is the opposite of traditional loss minimization in learning. This reversal creates instability, especially on larger datasets, as the model struggles to balance unlearning with maintaining language capacity, leading to over-unlearning. In this paper, we introduce UnDIAL (Unlearning via Self-Distillation on Adjusted Logits), a novel and robust unlearning method. Our approach leverages self-distillation to adjust logits and selectively reduce the influence of targeted tokens. This technique ensures smooth convergence and avoids catastrophic forgetting, even in challenging unlearning tasks with large datasets and sequential unlearning requests. Extensive experiments show that UnDIAL is the first direct tuning method to achieve both robustness in unlearning and scalability, while maintaining stable training dynamics and resilience to hyperparameter tuning."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dong-etal-2025-undial">
<titleInfo>
<title>UNDIAL: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yijiang</namePart>
<namePart type="given">River</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongzhou</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikhail</namePart>
<namePart type="family">Belkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramon</namePart>
<namePart type="family">Huerta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Vulić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Mitigating the retention of sensitive or private information in large language models is essential for enhancing privacy and safety. Existing unlearning methods, like Gradient Ascent and Negative Preference Optimization, directly tune models to remove unwanted information. However, these methods often become unstable because they fine-tune by maximizing loss, which is the opposite of traditional loss minimization in learning. This reversal creates instability, especially on larger datasets, as the model struggles to balance unlearning with maintaining language capacity, leading to over-unlearning. In this paper, we introduce UnDIAL (Unlearning via Self-Distillation on Adjusted Logits), a novel and robust unlearning method. Our approach leverages self-distillation to adjust logits and selectively reduce the influence of targeted tokens. This technique ensures smooth convergence and avoids catastrophic forgetting, even in challenging unlearning tasks with large datasets and sequential unlearning requests. Extensive experiments show that UnDIAL is the first direct tuning method to achieve both robustness in unlearning and scalability, while maintaining stable training dynamics and resilience to hyperparameter tuning.</abstract>
<identifier type="citekey">dong-etal-2025-undial</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.444</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.444/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>8827</start>
<end>8840</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T UNDIAL: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models
%A Dong, Yijiang River
%A Lin, Hongzhou
%A Belkin, Mikhail
%A Huerta, Ramon
%A Vulić, Ivan
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F dong-etal-2025-undial
%X Mitigating the retention of sensitive or private information in large language models is essential for enhancing privacy and safety. Existing unlearning methods, like Gradient Ascent and Negative Preference Optimization, directly tune models to remove unwanted information. However, these methods often become unstable because they fine-tune by maximizing loss, which is the opposite of traditional loss minimization in learning. This reversal creates instability, especially on larger datasets, as the model struggles to balance unlearning with maintaining language capacity, leading to over-unlearning. In this paper, we introduce UnDIAL (Unlearning via Self-Distillation on Adjusted Logits), a novel and robust unlearning method. Our approach leverages self-distillation to adjust logits and selectively reduce the influence of targeted tokens. This technique ensures smooth convergence and avoids catastrophic forgetting, even in challenging unlearning tasks with large datasets and sequential unlearning requests. Extensive experiments show that UnDIAL is the first direct tuning method to achieve both robustness in unlearning and scalability, while maintaining stable training dynamics and resilience to hyperparameter tuning.
%R 10.18653/v1/2025.naacl-long.444
%U https://aclanthology.org/2025.naacl-long.444/
%U https://doi.org/10.18653/v1/2025.naacl-long.444
%P 8827-8840
Markdown (Informal)
[UNDIAL: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models](https://aclanthology.org/2025.naacl-long.444/) (Dong et al., NAACL 2025)
ACL