@inproceedings{zhang-2025-bridging,
title = "Bridging Multimodal and Video Summarization: A Unified Survey",
author = "Zhang, Haopeng",
editor = "Dong, Yue and
Xiao, Wen and
Zhang, Haopeng and
Zhang, Rui and
Ernst, Ori and
Wang, Lu and
Liu, Fei",
booktitle = "Proceedings of The 5th New Frontiers in Summarization Workshop",
month = nov,
year = "2025",
address = "Hybrid",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.newsum-main.11/",
pages = "157--171",
isbn = "979-8-89176-337-1",
abstract = "Multimodal summarization (MMS) and video summarization (VS) have traditionally evolved in separate communities{---}natural language processing (NLP) and computer vision (CV), respectively. MMS focuses on generating textual summaries from inputs such as text, images, or audio, while VS emphasizes selecting key visual content. With the recent rise of vision-language models (VLMs), these once-disparate tasks are converging under a unified framework that integrates visual and linguistic understanding. In this survey, we provide a unified perspective that bridges MMS and VS. We formalize the task landscape, review key datasets and evaluation metrics, and categorize major modeling approaches into a new taxonomy. In addition, we highlight core challenges and outline future directions toward building general-purpose multimodal summarization systems. By synthesizing insights from both NLP and CV communities, this survey aims to establish a coherent foundation for advancing this rapidly evolving field."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-2025-bridging">
<titleInfo>
<title>Bridging Multimodal and Video Summarization: A Unified Survey</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haopeng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The 5th New Frontiers in Summarization Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wen</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haopeng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ori</namePart>
<namePart type="family">Ernst</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hybrid</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-337-1</identifier>
</relatedItem>
<abstract>Multimodal summarization (MMS) and video summarization (VS) have traditionally evolved in separate communities—natural language processing (NLP) and computer vision (CV), respectively. MMS focuses on generating textual summaries from inputs such as text, images, or audio, while VS emphasizes selecting key visual content. With the recent rise of vision-language models (VLMs), these once-disparate tasks are converging under a unified framework that integrates visual and linguistic understanding. In this survey, we provide a unified perspective that bridges MMS and VS. We formalize the task landscape, review key datasets and evaluation metrics, and categorize major modeling approaches into a new taxonomy. In addition, we highlight core challenges and outline future directions toward building general-purpose multimodal summarization systems. By synthesizing insights from both NLP and CV communities, this survey aims to establish a coherent foundation for advancing this rapidly evolving field.</abstract>
<identifier type="citekey">zhang-2025-bridging</identifier>
<location>
<url>https://aclanthology.org/2025.newsum-main.11/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>157</start>
<end>171</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bridging Multimodal and Video Summarization: A Unified Survey
%A Zhang, Haopeng
%Y Dong, Yue
%Y Xiao, Wen
%Y Zhang, Haopeng
%Y Zhang, Rui
%Y Ernst, Ori
%Y Wang, Lu
%Y Liu, Fei
%S Proceedings of The 5th New Frontiers in Summarization Workshop
%D 2025
%8 November
%I Association for Computational Linguistics
%C Hybrid
%@ 979-8-89176-337-1
%F zhang-2025-bridging
%X Multimodal summarization (MMS) and video summarization (VS) have traditionally evolved in separate communities—natural language processing (NLP) and computer vision (CV), respectively. MMS focuses on generating textual summaries from inputs such as text, images, or audio, while VS emphasizes selecting key visual content. With the recent rise of vision-language models (VLMs), these once-disparate tasks are converging under a unified framework that integrates visual and linguistic understanding. In this survey, we provide a unified perspective that bridges MMS and VS. We formalize the task landscape, review key datasets and evaluation metrics, and categorize major modeling approaches into a new taxonomy. In addition, we highlight core challenges and outline future directions toward building general-purpose multimodal summarization systems. By synthesizing insights from both NLP and CV communities, this survey aims to establish a coherent foundation for advancing this rapidly evolving field.
%U https://aclanthology.org/2025.newsum-main.11/
%P 157-171
Markdown (Informal)
[Bridging Multimodal and Video Summarization: A Unified Survey](https://aclanthology.org/2025.newsum-main.11/) (Zhang, NewSum 2025)
ACL