@inproceedings{wang-etal-2025-prolongvid,
title = "{P}ro{L}ong{V}id: A Simple but Strong Baseline for Long-context Video Instruction Tuning",
author = "Wang, Rui and
Li, Bohao and
Dai, Xiyang and
Yang, Jianwei and
Chen, Yi-Ling and
Xing, Zhen and
Yang, Yifan and
Chen, Dongdong and
Qiu, Xipeng and
Wu, Zuxuan and
Jiang, Yu-Gang",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1570/",
doi = "10.18653/v1/2025.emnlp-main.1570",
pages = "30836--30849",
ISBN = "979-8-89176-332-6",
abstract = "Video understanding is essential for multimodal large language models (MLLMs) to interact effectively with users and the real world. However, analyzing long videos remains a major challenge due to the lack of high-quality video instruction data and effective training strategies. In this paper, we introduce a simple yet effective baseline for long-context video understanding, including dataset construction and training recipes. We curate a large-scale video instruction dataset with over 1M samples, encompassing videos from a few seconds to several minutes across diverse sources, without any human annotations. Additionally, we propose a progressive video instruction tuning strategy that incrementally increases input context length, enabling better utilization of videos of varying durations. Comprehensive experiments demonstrate that our dataset significantly outperforms existing video instruction datasets for fine-tuning MLLMs. Furthermore, our training approach establishes a strong video MLLM baseline, surpassing previous open-source models on video benchmarks and outperforming proprietary models like GPT-4V and GPT-4o-mini on VideoMME, even with a compact 7B model."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2025-prolongvid">
<titleInfo>
<title>ProLongVid: A Simple but Strong Baseline for Long-context Video Instruction Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bohao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiyang</namePart>
<namePart type="family">Dai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianwei</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi-Ling</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhen</namePart>
<namePart type="family">Xing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifan</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongdong</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xipeng</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zuxuan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu-Gang</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Video understanding is essential for multimodal large language models (MLLMs) to interact effectively with users and the real world. However, analyzing long videos remains a major challenge due to the lack of high-quality video instruction data and effective training strategies. In this paper, we introduce a simple yet effective baseline for long-context video understanding, including dataset construction and training recipes. We curate a large-scale video instruction dataset with over 1M samples, encompassing videos from a few seconds to several minutes across diverse sources, without any human annotations. Additionally, we propose a progressive video instruction tuning strategy that incrementally increases input context length, enabling better utilization of videos of varying durations. Comprehensive experiments demonstrate that our dataset significantly outperforms existing video instruction datasets for fine-tuning MLLMs. Furthermore, our training approach establishes a strong video MLLM baseline, surpassing previous open-source models on video benchmarks and outperforming proprietary models like GPT-4V and GPT-4o-mini on VideoMME, even with a compact 7B model.</abstract>
<identifier type="citekey">wang-etal-2025-prolongvid</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.1570</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1570/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>30836</start>
<end>30849</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ProLongVid: A Simple but Strong Baseline for Long-context Video Instruction Tuning
%A Wang, Rui
%A Li, Bohao
%A Dai, Xiyang
%A Yang, Jianwei
%A Chen, Yi-Ling
%A Xing, Zhen
%A Yang, Yifan
%A Chen, Dongdong
%A Qiu, Xipeng
%A Wu, Zuxuan
%A Jiang, Yu-Gang
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F wang-etal-2025-prolongvid
%X Video understanding is essential for multimodal large language models (MLLMs) to interact effectively with users and the real world. However, analyzing long videos remains a major challenge due to the lack of high-quality video instruction data and effective training strategies. In this paper, we introduce a simple yet effective baseline for long-context video understanding, including dataset construction and training recipes. We curate a large-scale video instruction dataset with over 1M samples, encompassing videos from a few seconds to several minutes across diverse sources, without any human annotations. Additionally, we propose a progressive video instruction tuning strategy that incrementally increases input context length, enabling better utilization of videos of varying durations. Comprehensive experiments demonstrate that our dataset significantly outperforms existing video instruction datasets for fine-tuning MLLMs. Furthermore, our training approach establishes a strong video MLLM baseline, surpassing previous open-source models on video benchmarks and outperforming proprietary models like GPT-4V and GPT-4o-mini on VideoMME, even with a compact 7B model.
%R 10.18653/v1/2025.emnlp-main.1570
%U https://aclanthology.org/2025.emnlp-main.1570/
%U https://doi.org/10.18653/v1/2025.emnlp-main.1570
%P 30836-30849
Markdown (Informal)
[ProLongVid: A Simple but Strong Baseline for Long-context Video Instruction Tuning](https://aclanthology.org/2025.emnlp-main.1570/) (Wang et al., EMNLP 2025)
ACL
- Rui Wang, Bohao Li, Xiyang Dai, Jianwei Yang, Yi-Ling Chen, Zhen Xing, Yifan Yang, Dongdong Chen, Xipeng Qiu, Zuxuan Wu, and Yu-Gang Jiang. 2025. ProLongVid: A Simple but Strong Baseline for Long-context Video Instruction Tuning. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 30836–30849, Suzhou, China. Association for Computational Linguistics.