@inproceedings{wu-etal-2025-mitigating-visual,
title = "Mitigating Visual Knowledge Forgetting in {MLLM} Instruction-tuning via Modality-decoupled Gradient Descent",
author = "Wu, Junda and
Xiong, Yuxin and
Li, Xintong and
Xia, Yu and
Wang, Ruoyu and
Wang, Yu and
Yu, Tong and
Kim, Sungchul and
Rossi, Ryan A. and
Yao, Lina and
Shang, Jingbo and
McAuley, Julian",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.123/",
doi = "10.18653/v1/2025.findings-emnlp.123",
pages = "2282--2295",
ISBN = "979-8-89176-335-7",
abstract = "Recent MLLMs have demonstrated strong visual understanding and reasoning after large-scale multimodal pre-training. However, instruction-tuning is typically text-driven with limited visual supervision, leading to significant visual forgetting and degradation of pre-trained visual knowledge. Existing fine-tuning and continual learning methods compress visual representations and emphasize task alignment over visual retention, failing to address this challenge. We present a novel perspective using effective rank to quantify the loss of visual representation richness, framing visual forgetting as excessive compression under the information bottleneck principle. To address this, we propose modality-decoupled gradient descent (MDGD), which regulates gradient updates to preserve the effective rank of visual features and explicitly disentangles visual learning from task-specific alignment. We further introduce a memory-efficient fine-tuning variant using gradient masking for parameter-efficient adaptation. Extensive experiments show that MDGD effectively mitigates visual forgetting across downstream tasks and models, maintaining pre-trained visual knowledge while supporting strong task adaptation."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-etal-2025-mitigating-visual">
<titleInfo>
<title>Mitigating Visual Knowledge Forgetting in MLLM Instruction-tuning via Modality-decoupled Gradient Descent</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junda</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuxin</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xintong</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruoyu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sungchul</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Rossi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingbo</namePart>
<namePart type="family">Shang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">McAuley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Recent MLLMs have demonstrated strong visual understanding and reasoning after large-scale multimodal pre-training. However, instruction-tuning is typically text-driven with limited visual supervision, leading to significant visual forgetting and degradation of pre-trained visual knowledge. Existing fine-tuning and continual learning methods compress visual representations and emphasize task alignment over visual retention, failing to address this challenge. We present a novel perspective using effective rank to quantify the loss of visual representation richness, framing visual forgetting as excessive compression under the information bottleneck principle. To address this, we propose modality-decoupled gradient descent (MDGD), which regulates gradient updates to preserve the effective rank of visual features and explicitly disentangles visual learning from task-specific alignment. We further introduce a memory-efficient fine-tuning variant using gradient masking for parameter-efficient adaptation. Extensive experiments show that MDGD effectively mitigates visual forgetting across downstream tasks and models, maintaining pre-trained visual knowledge while supporting strong task adaptation.</abstract>
<identifier type="citekey">wu-etal-2025-mitigating-visual</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.123</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.123/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>2282</start>
<end>2295</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mitigating Visual Knowledge Forgetting in MLLM Instruction-tuning via Modality-decoupled Gradient Descent
%A Wu, Junda
%A Xiong, Yuxin
%A Li, Xintong
%A Xia, Yu
%A Wang, Ruoyu
%A Wang, Yu
%A Yu, Tong
%A Kim, Sungchul
%A Rossi, Ryan A.
%A Yao, Lina
%A Shang, Jingbo
%A McAuley, Julian
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F wu-etal-2025-mitigating-visual
%X Recent MLLMs have demonstrated strong visual understanding and reasoning after large-scale multimodal pre-training. However, instruction-tuning is typically text-driven with limited visual supervision, leading to significant visual forgetting and degradation of pre-trained visual knowledge. Existing fine-tuning and continual learning methods compress visual representations and emphasize task alignment over visual retention, failing to address this challenge. We present a novel perspective using effective rank to quantify the loss of visual representation richness, framing visual forgetting as excessive compression under the information bottleneck principle. To address this, we propose modality-decoupled gradient descent (MDGD), which regulates gradient updates to preserve the effective rank of visual features and explicitly disentangles visual learning from task-specific alignment. We further introduce a memory-efficient fine-tuning variant using gradient masking for parameter-efficient adaptation. Extensive experiments show that MDGD effectively mitigates visual forgetting across downstream tasks and models, maintaining pre-trained visual knowledge while supporting strong task adaptation.
%R 10.18653/v1/2025.findings-emnlp.123
%U https://aclanthology.org/2025.findings-emnlp.123/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.123
%P 2282-2295
Markdown (Informal)
[Mitigating Visual Knowledge Forgetting in MLLM Instruction-tuning via Modality-decoupled Gradient Descent](https://aclanthology.org/2025.findings-emnlp.123/) (Wu et al., Findings 2025)
ACL
- Junda Wu, Yuxin Xiong, Xintong Li, Yu Xia, Ruoyu Wang, Yu Wang, Tong Yu, Sungchul Kim, Ryan A. Rossi, Lina Yao, Jingbo Shang, and Julian McAuley. 2025. Mitigating Visual Knowledge Forgetting in MLLM Instruction-tuning via Modality-decoupled Gradient Descent. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 2282–2295, Suzhou, China. Association for Computational Linguistics.