@comment{ACL Anthology record 2025.emnlp-main.98. The abstract keeps the publisher's LaTeX markup verbatim.}
@inproceedings{liu-etal-2025-video,
    title = {Video Compression Commander: Plug-and-Play Inference Acceleration for Video Large Language Models},
    author = {Liu, Xuyang and
      Wang, Yiyu and
      Ma, Junpeng and
      Zhang, Linfeng},
    editor = {Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet},
    booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
    month = nov,
    year = {2025},
    address = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2025.emnlp-main.98/},
    doi = {10.18653/v1/2025.emnlp-main.98},
    pages = {1910--1924},
    isbn = {979-8-89176-332-6},
    abstract = {Video large language models (VideoLLM) excel at video understanding, but face efficiency challenges due to the quadratic complexity of abundant visual tokens. Our systematic analysis of token compression methods for VideoLLMs reveals two critical issues: \textbf{(i)} overlooking distinctive visual signals across frames, leading to information loss; \textbf{(ii)} suffering from implementation constraints, causing incompatibility with modern architectures or efficient operators. To address these challenges, we distill three design principles for VideoLLM token compression and propose a plug-and-play inference acceleration framework ``\textbf{Vid}eo \textbf{Com}pression \textbf{Com}mander'' (\textbf{VidCom$^2$}). By quantifying each frame{'}s uniqueness, VidCom$^2$ adaptively adjusts compression intensity across frames, effectively preserving essential information while reducing redundancy in video sequences. Extensive experiments across various VideoLLMs and benchmarks demonstrate the superior performance and efficiency of our VidCom$^2$. With only \textbf{25{\%}} visual tokens, VidCom$^2$ achieves \textbf{99.6{\%}} of the original performance on LLaVA-OV while reducing \textbf{70.8{\%}} of the LLM generation latency. Notably, our Frame Compression Adjustment strategy is compatible with other token compression methods to further improve their performance. Our code is available at \url{https://github.com/xuyang-liu16/VidCom2}.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2025-video">
<titleInfo>
<title>Video Compression Commander: Plug-and-Play Inference Acceleration for Video Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xuyang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiyu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junpeng</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linfeng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Video large language models (VideoLLM) excel at video understanding, but face efficiency challenges due to the quadratic complexity of abundant visual tokens. Our systematic analysis of token compression methods for VideoLLMs reveals two critical issues: (i) overlooking distinctive visual signals across frames, leading to information loss; (ii) suffering from implementation constraints, causing incompatibility with modern architectures or efficient operators. To address these challenges, we distill three design principles for VideoLLM token compression and propose a plug-and-play inference acceleration framework “Video Compression Commander” (VidCom²). By quantifying each frame’s uniqueness, VidCom² adaptively adjusts compression intensity across frames, effectively preserving essential information while reducing redundancy in video sequences. Extensive experiments across various VideoLLMs and benchmarks demonstrate the superior performance and efficiency of our VidCom². With only 25% visual tokens, VidCom² achieves 99.6% of the original performance on LLaVA-OV while reducing 70.8% of the LLM generation latency. Notably, our Frame Compression Adjustment strategy is compatible with other token compression methods to further improve their performance. Our code is available at https://github.com/xuyang-liu16/VidCom2.</abstract>
<identifier type="citekey">liu-etal-2025-video</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.98</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.98/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1910</start>
<end>1924</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Video Compression Commander: Plug-and-Play Inference Acceleration for Video Large Language Models
%A Liu, Xuyang
%A Wang, Yiyu
%A Ma, Junpeng
%A Zhang, Linfeng
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F liu-etal-2025-video
%X Video large language models (VideoLLM) excel at video understanding, but face efficiency challenges due to the quadratic complexity of abundant visual tokens. Our systematic analysis of token compression methods for VideoLLMs reveals two critical issues: (i) overlooking distinctive visual signals across frames, leading to information loss; (ii) suffering from implementation constraints, causing incompatibility with modern architectures or efficient operators. To address these challenges, we distill three design principles for VideoLLM token compression and propose a plug-and-play inference acceleration framework “Video Compression Commander” (VidCom²). By quantifying each frame’s uniqueness, VidCom² adaptively adjusts compression intensity across frames, effectively preserving essential information while reducing redundancy in video sequences. Extensive experiments across various VideoLLMs and benchmarks demonstrate the superior performance and efficiency of our VidCom². With only 25% visual tokens, VidCom² achieves 99.6% of the original performance on LLaVA-OV while reducing 70.8% of the LLM generation latency. Notably, our Frame Compression Adjustment strategy is compatible with other token compression methods to further improve their performance. Our code is available at https://github.com/xuyang-liu16/VidCom2.
%R 10.18653/v1/2025.emnlp-main.98
%U https://aclanthology.org/2025.emnlp-main.98/
%U https://doi.org/10.18653/v1/2025.emnlp-main.98
%P 1910-1924
Markdown (Informal)
[Video Compression Commander: Plug-and-Play Inference Acceleration for Video Large Language Models](https://aclanthology.org/2025.emnlp-main.98/) (Liu et al., EMNLP 2025)
ACL