@inproceedings{yang-etal-2025-cliperase,
title = "{CLIPE}rase: Efficient Unlearning of Visual-Textual Associations in {CLIP}",
author = "Yang, Tianyu and
Dai, Lisen and
Wang, Xiangqi and
Cheng, Minhao and
Tian, Yapeng and
Zhang, Xiangliang",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1469/",
doi = "10.18653/v1/2025.acl-long.1469",
pages = "30438--30452",
ISBN = "979-8-89176-251-0",
abstract = "Machine unlearning (MU) has gained significant attention as a means to remove the influence of specific data from a trained model without requiring full retraining. While progress has been made in unimodal domains like text and image classification, unlearning in multimodal models remains relatively under-explored. In this work, we address the unique challenges of unlearning in CLIP, a prominent multimodal model that aligns visual and textual representations. We introduce CLIPErase, a novel approach that disentangles and selectively forgets both visual and textual associations, ensuring that unlearning does not compromise model performance.CLIPErase consists of three key modules: a Forgetting Module that disrupts the associations in the forget set, a Retention Module that preserves performance on the retain set, and a Consistency Module that maintains consistency with the original model. Extensive experiments on CIFAR-100, Flickr30K, and Conceptual 12M across five CLIP downstream tasks, as well as an evaluation on diffusion models, demonstrate that CLIPErase effectively removes designated associations from multimodal samples in downstream tasks, while preserving the model{'}s performance on the retain set after unlearning."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-etal-2025-cliperase">
<titleInfo>
<title>CLIPErase: Efficient Unlearning of Visual-Textual Associations in CLIP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianyu</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lisen</namePart>
<namePart type="family">Dai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangqi</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minhao</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yapeng</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangliang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Machine unlearning (MU) has gained significant attention as a means to remove the influence of specific data from a trained model without requiring full retraining. While progress has been made in unimodal domains like text and image classification, unlearning in multimodal models remains relatively under-explored. In this work, we address the unique challenges of unlearning in CLIP, a prominent multimodal model that aligns visual and textual representations. We introduce CLIPErase, a novel approach that disentangles and selectively forgets both visual and textual associations, ensuring that unlearning does not compromise model performance.CLIPErase consists of three key modules: a Forgetting Module that disrupts the associations in the forget set, a Retention Module that preserves performance on the retain set, and a Consistency Module that maintains consistency with the original model. Extensive experiments on CIFAR-100, Flickr30K, and Conceptual 12M across five CLIP downstream tasks, as well as an evaluation on diffusion models, demonstrate that CLIPErase effectively removes designated associations from multimodal samples in downstream tasks, while preserving the model’s performance on the retain set after unlearning.</abstract>
<identifier type="citekey">yang-etal-2025-cliperase</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1469</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1469/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>30438</start>
<end>30452</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CLIPErase: Efficient Unlearning of Visual-Textual Associations in CLIP
%A Yang, Tianyu
%A Dai, Lisen
%A Wang, Xiangqi
%A Cheng, Minhao
%A Tian, Yapeng
%A Zhang, Xiangliang
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F yang-etal-2025-cliperase
%X Machine unlearning (MU) has gained significant attention as a means to remove the influence of specific data from a trained model without requiring full retraining. While progress has been made in unimodal domains like text and image classification, unlearning in multimodal models remains relatively under-explored. In this work, we address the unique challenges of unlearning in CLIP, a prominent multimodal model that aligns visual and textual representations. We introduce CLIPErase, a novel approach that disentangles and selectively forgets both visual and textual associations, ensuring that unlearning does not compromise model performance.CLIPErase consists of three key modules: a Forgetting Module that disrupts the associations in the forget set, a Retention Module that preserves performance on the retain set, and a Consistency Module that maintains consistency with the original model. Extensive experiments on CIFAR-100, Flickr30K, and Conceptual 12M across five CLIP downstream tasks, as well as an evaluation on diffusion models, demonstrate that CLIPErase effectively removes designated associations from multimodal samples in downstream tasks, while preserving the model’s performance on the retain set after unlearning.
%R 10.18653/v1/2025.acl-long.1469
%U https://aclanthology.org/2025.acl-long.1469/
%U https://doi.org/10.18653/v1/2025.acl-long.1469
%P 30438-30452
Markdown (Informal)
[CLIPErase: Efficient Unlearning of Visual-Textual Associations in CLIP](https://aclanthology.org/2025.acl-long.1469/) (Yang et al., ACL 2025)
ACL
- Tianyu Yang, Lisen Dai, Xiangqi Wang, Minhao Cheng, Yapeng Tian, and Xiangliang Zhang. 2025. CLIPErase: Efficient Unlearning of Visual-Textual Associations in CLIP. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 30438–30452, Vienna, Austria. Association for Computational Linguistics.