@inproceedings{pennec-etal-2025-integrating,
title = "Integrating Video and Text: A Balanced Approach to Multimodal Summary Generation and Evaluation",
author = "Pennec, Galann and
Liu, Zhengyuan and
Asher, Nicholas and
Muller, Philippe and
Chen, Nancy F.",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://aclanthology.org/2025.ijcnlp-long.129/",
pages = "2403--2426",
ISBN = "979-8-89176-298-5",
abstract = "Vision-Language Models (VLMs) often struggle to balance visual and textual information when summarizing complex multimodal inputs, such as entire TV show episodes. In this paper, we propose a zero-shot video-to-text summarization approach that builds its own screenplay-like representation of an episode, effectively integrating key video moments, dialogue, and character information into a unified document. Unlike previous approaches, we simultaneously generate screenplays and name the characters in zero-shot, using only the audio, video, and transcripts as input. Additionally, we highlight that existing summarization metrics can fail to assess the multimodal content in summaries. To address this, we introduce MFactSum, a multimodal metric that evaluates summaries with respect to both vision and text modalities. Using MFactSum, we evaluate our screenplay summaries on the SummScreen3D dataset, demonstrating superiority against state-of-the-art VLMs such as Gemini 1.5 by generating summaries containing 20{\%} more relevant visual information while requiring 75{\%} less of the video as input."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pennec-etal-2025-integrating">
<titleInfo>
<title>Integrating Video and Text: A Balanced Approach to Multimodal Summary Generation and Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galann</namePart>
<namePart type="family">Pennec</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengyuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Asher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Muller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nancy</namePart>
<namePart type="given">F</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haofen</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derek</namePart>
<namePart type="given">F</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Biplab</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhirendra</namePart>
<namePart type="given">Pratap</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The Asian Federation of Natural Language Processing and The Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-298-5</identifier>
</relatedItem>
<abstract>Vision-Language Models (VLMs) often struggle to balance visual and textual information when summarizing complex multimodal inputs, such as entire TV show episodes. In this paper, we propose a zero-shot video-to-text summarization approach that builds its own screenplay-like representation of an episode, effectively integrating key video moments, dialogue, and character information into a unified document. Unlike previous approaches, we simultaneously generate screenplays and name the characters in zero-shot, using only the audio, video, and transcripts as input. Additionally, we highlight that existing summarization metrics can fail to assess the multimodal content in summaries. To address this, we introduce MFactSum, a multimodal metric that evaluates summaries with respect to both vision and text modalities. Using MFactSum, we evaluate our screenplay summaries on the SummScreen3D dataset, demonstrating superiority against state-of-the-art VLMs such as Gemini 1.5 by generating summaries containing 20% more relevant visual information while requiring 75% less of the video as input.</abstract>
<identifier type="citekey">pennec-etal-2025-integrating</identifier>
<location>
<url>https://aclanthology.org/2025.ijcnlp-long.129/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>2403</start>
<end>2426</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Integrating Video and Text: A Balanced Approach to Multimodal Summary Generation and Evaluation
%A Pennec, Galann
%A Liu, Zhengyuan
%A Asher, Nicholas
%A Muller, Philippe
%A Chen, Nancy F.
%Y Inui, Kentaro
%Y Sakti, Sakriani
%Y Wang, Haofen
%Y Wong, Derek F.
%Y Bhattacharyya, Pushpak
%Y Banerjee, Biplab
%Y Ekbal, Asif
%Y Chakraborty, Tanmoy
%Y Singh, Dhirendra Pratap
%S Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics
%D 2025
%8 December
%I The Asian Federation of Natural Language Processing and The Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-298-5
%F pennec-etal-2025-integrating
%X Vision-Language Models (VLMs) often struggle to balance visual and textual information when summarizing complex multimodal inputs, such as entire TV show episodes. In this paper, we propose a zero-shot video-to-text summarization approach that builds its own screenplay-like representation of an episode, effectively integrating key video moments, dialogue, and character information into a unified document. Unlike previous approaches, we simultaneously generate screenplays and name the characters in zero-shot, using only the audio, video, and transcripts as input. Additionally, we highlight that existing summarization metrics can fail to assess the multimodal content in summaries. To address this, we introduce MFactSum, a multimodal metric that evaluates summaries with respect to both vision and text modalities. Using MFactSum, we evaluate our screenplay summaries on the SummScreen3D dataset, demonstrating superiority against state-of-the-art VLMs such as Gemini 1.5 by generating summaries containing 20% more relevant visual information while requiring 75% less of the video as input.
%U https://aclanthology.org/2025.ijcnlp-long.129/
%P 2403-2426
Markdown (Informal)
[Integrating Video and Text: A Balanced Approach to Multimodal Summary Generation and Evaluation](https://aclanthology.org/2025.ijcnlp-long.129/) (Pennec et al., IJCNLP-AACL 2025)
ACL
- Galann Pennec, Zhengyuan Liu, Nicholas Asher, Philippe Muller, and Nancy F. Chen. 2025. Integrating Video and Text: A Balanced Approach to Multimodal Summary Generation and Evaluation. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 2403–2426, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.