@inproceedings{qu-etal-2025-uq,
title = "$\texttt{UQ-Merge}$: Uncertainty Guided Multimodal Large Language Model Merging",
author = "Qu, Huaizhi and
Zhao, Xinyu and
Peng, Jie and
Lee, Kwonjoon and
Dariush, Behzad and
Chen, Tianlong",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.73/",
doi = "10.18653/v1/2025.findings-acl.73",
pages = "1401--1417",
ISBN = "979-8-89176-256-5",
abstract = "Multimodal Large Language Models (MLLMs) have gained increasing popularity as a promising framework for leveraging the strong language reasoning capabilities in the vision-language domain. Given a wide range of MLLMs, model merging potentially offers a cheap way to aggregate their diverse knowledge into a single MLLM. However, directly plug-in existing model merging approaches often leads to suboptimal performance due to (1) inclusion of harmful models that have over-confident predictions in the target task; (2) the lack of specialized designs for vision-language inputs. To tackle these pain points, we conduct pioneering investigations to dissect the merging procedures and propose an uncertainty-guided MLLM merging algorithm, $\textit{i.e.}$, $\texttt{UQ-Merge}$, which $i$) identifies beneficial candidates for merging, $ii$) determines the merging order and the number of helpful candidates, and $iii$) performs appropriate merging. Within our framework, we consider uncertainty quantification on both text and vision inputs to examine the MLLM prediction confidence, and then decide whether and when a MLLM needs to be included. It is worth mentioning that our vision-language uncertainty quantification does not require access to sample labels, making it more practical in various scenarios. Extensive experiments consistently demonstrate the superior MLLM merging performance of $\texttt{UQ-Merge}$ in both held-in and held-out vision-language benchmarks. For example, compared to existing state-of-the-art merging methods, $\texttt{UQ-Merge}$ brings substantial performance improvements of up to 44.3{\%} on average accuracy in 12 datasets. Codes are available at https://anonymous.4open.science/r/UQ-Merge-7CD7."
}