@inproceedings{liu-etal-2025-language,
title = "Language Constrained Multimodal Hyper Adapter For Many-to-Many Multimodal Summarization",
author = "Liu, Nayu and
Yao, Fanglong and
Luo, Haoran and
Yang, Yong and
Tang, Chen and
Lv, Bo",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1229/",
doi = "10.18653/v1/2025.acl-long.1229",
pages = "25285--25298",
ISBN = "979-8-89176-251-0",
abstract = "Multimodal summarization (MS) combines text and visuals to generate summaries. Recently, many-to-many multimodal summarization (M3S) garnered interest as it enables a unified model for multilingual and cross-lingual MS. Existing methods have made progress by facilitating the transfer of common multimodal summarization knowledge. While, prior M3S models that fully share parameters neglect the language-specific knowledge learning, where potential interference between languages may limit the flexible adaptation of MS modes across different language combinations and hinder further collaborative improvements in joint M3S training. Based on this observation, we propose Language Constrained Multimodal Hyper Adapter (LCMHA) for M3S. LCMHA integrates language-specific multimodal adapters into multilingual pre-trained backbones via a language constrained hypernetwork, enabling relaxed parameter sharing that enhances language-specific learning while preserving shared MS knowledge learning. In addition, a language-regularized hypernetwork is designed to balance intra- and inter-language learning, generating language-specific adaptation weights and enhancing the retention of distinct language features through the regularization of generated parameters. Experimental results on the M3Sum benchmark show LCMHA{'}s effectiveness and scalability across multiple multilingual pre-trained backbones."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2025-language">
<titleInfo>
<title>Language Constrained Multimodal Hyper Adapter For Many-to-Many Multimodal Summarization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nayu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fanglong</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoran</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Lv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Multimodal summarization (MS) combines text and visuals to generate summaries. Recently, many-to-many multimodal summarization (M3S) garnered interest as it enables a unified model for multilingual and cross-lingual MS. Existing methods have made progress by facilitating the transfer of common multimodal summarization knowledge. While, prior M3S models that fully share parameters neglect the language-specific knowledge learning, where potential interference between languages may limit the flexible adaptation of MS modes across different language combinations and hinder further collaborative improvements in joint M3S training. Based on this observation, we propose Language Constrained Multimodal Hyper Adapter (LCMHA) for M3S. LCMHA integrates language-specific multimodal adapters into multilingual pre-trained backbones via a language constrained hypernetwork, enabling relaxed parameter sharing that enhances language-specific learning while preserving shared MS knowledge learning. In addition, a language-regularized hypernetwork is designed to balance intra- and inter-language learning, generating language-specific adaptation weights and enhancing the retention of distinct language features through the regularization of generated parameters. Experimental results on the M3Sum benchmark show LCMHA’s effectiveness and scalability across multiple multilingual pre-trained backbones.</abstract>
<identifier type="citekey">liu-etal-2025-language</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1229</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1229/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>25285</start>
<end>25298</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Constrained Multimodal Hyper Adapter For Many-to-Many Multimodal Summarization
%A Liu, Nayu
%A Yao, Fanglong
%A Luo, Haoran
%A Yang, Yong
%A Tang, Chen
%A Lv, Bo
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F liu-etal-2025-language
%X Multimodal summarization (MS) combines text and visuals to generate summaries. Recently, many-to-many multimodal summarization (M3S) garnered interest as it enables a unified model for multilingual and cross-lingual MS. Existing methods have made progress by facilitating the transfer of common multimodal summarization knowledge. While, prior M3S models that fully share parameters neglect the language-specific knowledge learning, where potential interference between languages may limit the flexible adaptation of MS modes across different language combinations and hinder further collaborative improvements in joint M3S training. Based on this observation, we propose Language Constrained Multimodal Hyper Adapter (LCMHA) for M3S. LCMHA integrates language-specific multimodal adapters into multilingual pre-trained backbones via a language constrained hypernetwork, enabling relaxed parameter sharing that enhances language-specific learning while preserving shared MS knowledge learning. In addition, a language-regularized hypernetwork is designed to balance intra- and inter-language learning, generating language-specific adaptation weights and enhancing the retention of distinct language features through the regularization of generated parameters. Experimental results on the M3Sum benchmark show LCMHA’s effectiveness and scalability across multiple multilingual pre-trained backbones.
%R 10.18653/v1/2025.acl-long.1229
%U https://aclanthology.org/2025.acl-long.1229/
%U https://doi.org/10.18653/v1/2025.acl-long.1229
%P 25285-25298
Markdown (Informal)
[Language Constrained Multimodal Hyper Adapter For Many-to-Many Multimodal Summarization](https://aclanthology.org/2025.acl-long.1229/) (Liu et al., ACL 2025)
ACL