% Findings of ACL 2026 paper; ACL Anthology record 2026.findings-acl.1210.
% Field values are brace-delimited so the abstract's ``...'' quotes need no escaping.
@inproceedings{chen-etal-2026-preserving,
  title     = {Preserving Language Capabilities in Vision-Language Models via Representation Regulation},
  author    = {Chen, ZiXuan and
               Tao, Juncheng and
               Zeng, Ziqian},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1210/},
  doi       = {10.18653/v1/2026.findings-acl.1210},
  pages     = {24189--24205},
  isbn      = {979-8-89176-395-1},
  abstract  = {Vision-Language Models (VLMs) provide a unified framework to process both text-only tasks and vision-language tasks. However, finetuning VLMs on vision-language data has degraded language capabilities. In this paper, we prove that as the training loss declines during finetuning, the visual representation and textual representation move closer to each other, a phenomenon we term ``representation mixing.'' We prove that the representation mixing occurring within the post-representation layers causes the degradation of language capabilities. Post-representation layers refer to the first few layers in LLMs that are involved in representation learning. To preserve the language capabilities, we propose the Representation Regulation for VLM Training (RRVLM), which introduces a Representation Distribution Difference (RDD) loss to reduce the distance between these representations. Extensive experiments on various benchmarks and VLM frameworks show that our method can effectively preserve the language capabilities and achieve superior vision-language performance.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; ID matches the citekey identifier below. -->
  <mods ID="chen-etal-2026-preserving">
    <titleInfo>
      <title>Preserving Language Capabilities in Vision-Language Models via Representation Regulation</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">ZiXuan</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Juncheng</namePart>
      <namePart type="family">Tao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ziqian</namePart>
      <namePart type="family">Zeng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Vision-Language Models (VLMs) provide a unified framework to process both text-only tasks and vision-language tasks. However, finetuning VLMs on vision-language data has degraded language capabilities. In this paper, we prove that as the training loss declines during finetuning, the visual representation and textual representation move closer to each other, a phenomenon we term “representation mixing.” We prove that the representation mixing occurring within the post-representation layers causes the degradation of language capabilities. Post-representation layers refer to the first few layers in LLMs that are involved in representation learning. To preserve the language capabilities, we propose the Representation Regulation for VLM Training (RRVLM), which introduces a Representation Distribution Difference (RDD) loss to reduce the distance between these representations. Extensive experiments on various benchmarks and VLM frameworks show that our method can effectively preserve the language capabilities and achieve superior vision-language performance.</abstract>
    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">chen-etal-2026-preserving</identifier>
    <identifier type="doi">10.18653/v1/2026.findings-acl.1210</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1210/</url>
    </location>
    <!-- Page range within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>24189</start>
        <end>24205</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Preserving Language Capabilities in Vision-Language Models via Representation Regulation
%A Chen, ZiXuan
%A Tao, Juncheng
%A Zeng, Ziqian
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F chen-etal-2026-preserving
%X Vision-Language Models (VLMs) provide a unified framework to process both text-only tasks and vision-language tasks. However, finetuning VLMs on vision-language data has degraded language capabilities. In this paper, we prove that as the training loss declines during finetuning, the visual representation and textual representation move closer to each other, a phenomenon we term “representation mixing.” We prove that the representation mixing occurring within the post-representation layers causes the degradation of language capabilities. Post-representation layers refer to the first few layers in LLMs that are involved in representation learning. To preserve the language capabilities, we propose the Representation Regulation for VLM Training (RRVLM), which introduces a Representation Distribution Difference (RDD) loss to reduce the distance between these representations. Extensive experiments on various benchmarks and VLM frameworks show that our method can effectively preserve the language capabilities and achieve superior vision-language performance.
%R 10.18653/v1/2026.findings-acl.1210
%U https://aclanthology.org/2026.findings-acl.1210/
%U https://doi.org/10.18653/v1/2026.findings-acl.1210
%P 24189-24205
Markdown (Informal)
[Preserving Language Capabilities in Vision-Language Models via Representation Regulation](https://aclanthology.org/2026.findings-acl.1210/) (Chen et al., Findings 2026)
ACL