@inproceedings{zhang-etal-2024-simple,
title = "A Simple {LLM} Framework for Long-Range Video Question-Answering",
author = "Zhang, Ce and
Lu, Taixi and
Islam, Md Mohaiminul and
Wang, Ziyang and
Yu, Shoubin and
Bansal, Mohit and
Bertasius, Gedas",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1209",
doi = "10.18653/v1/2024.emnlp-main.1209",
pages = "21715--21737",
abstract = "We present LLoVi, a simple yet effective **L**anguage-based **Lo**ng-range **Vi**deo question-answering (LVQA) framework. Our method decomposes the short- and long-range modeling aspects of LVQA into two stages. First, we use a short-term visual captioner to generate textual descriptions of short video clips (0.5-8 seconds in length) densely sampled from a long input video. Afterward, an LLM aggregates the densely extracted short-term captions to answer a given question. Furthermore, we propose a novel multi-round summarization prompt that asks the LLM first to summarize the noisy short-term visual captions and then answer a given input question. To analyze what makes our simple framework so effective, we thoroughly evaluate various components of our framework. Our empirical analysis reveals that the choice of the visual captioner and LLM is critical for good LVQA performance. The proposed multi-round summarization prompt also leads to a significant LVQA performance boost. Our method achieves the best-reported results on the EgoSchema dataset, best known for very long-form video question-answering. LLoVi also outperforms the previous state-of-the-art by **10.2{\%}** and **6.2{\%}** on NExT-QA and IntentQA for LVQA. Finally, we extend LLoVi to grounded VideoQA, which requires both QA and temporal localization, and show that it outperforms all prior methods on NExT-GQA. Code is available at https://github.com/CeeZh/LLoVi.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2024-simple">
<titleInfo>
<title>A Simple LLM Framework for Long-Range Video Question-Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ce</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taixi</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Mohaiminul</namePart>
<namePart type="family">Islam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziyang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shoubin</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gedas</namePart>
<namePart type="family">Bertasius</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present LLoVi, a simple yet effective **L**anguage-based **Lo**ng-range **Vi**deo question-answering (LVQA) framework. Our method decomposes the short- and long-range modeling aspects of LVQA into two stages. First, we use a short-term visual captioner to generate textual descriptions of short video clips (0.5-8 seconds in length) densely sampled from a long input video. Afterward, an LLM aggregates the densely extracted short-term captions to answer a given question. Furthermore, we propose a novel multi-round summarization prompt that asks the LLM first to summarize the noisy short-term visual captions and then answer a given input question. To analyze what makes our simple framework so effective, we thoroughly evaluate various components of our framework. Our empirical analysis reveals that the choice of the visual captioner and LLM is critical for good LVQA performance. The proposed multi-round summarization prompt also leads to a significant LVQA performance boost. Our method achieves the best-reported results on the EgoSchema dataset, best known for very long-form video question-answering. LLoVi also outperforms the previous state-of-the-art by **10.2%** and **6.2%** on NExT-QA and IntentQA for LVQA. Finally, we extend LLoVi to grounded VideoQA, which requires both QA and temporal localization, and show that it outperforms all prior methods on NExT-GQA. Code is available at https://github.com/CeeZh/LLoVi.</abstract>
<identifier type="citekey">zhang-etal-2024-simple</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.1209</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1209</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>21715</start>
<end>21737</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Simple LLM Framework for Long-Range Video Question-Answering
%A Zhang, Ce
%A Lu, Taixi
%A Islam, Md Mohaiminul
%A Wang, Ziyang
%A Yu, Shoubin
%A Bansal, Mohit
%A Bertasius, Gedas
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F zhang-etal-2024-simple
%X We present LLoVi, a simple yet effective **L**anguage-based **Lo**ng-range **Vi**deo question-answering (LVQA) framework. Our method decomposes the short- and long-range modeling aspects of LVQA into two stages. First, we use a short-term visual captioner to generate textual descriptions of short video clips (0.5-8 seconds in length) densely sampled from a long input video. Afterward, an LLM aggregates the densely extracted short-term captions to answer a given question. Furthermore, we propose a novel multi-round summarization prompt that asks the LLM first to summarize the noisy short-term visual captions and then answer a given input question. To analyze what makes our simple framework so effective, we thoroughly evaluate various components of our framework. Our empirical analysis reveals that the choice of the visual captioner and LLM is critical for good LVQA performance. The proposed multi-round summarization prompt also leads to a significant LVQA performance boost. Our method achieves the best-reported results on the EgoSchema dataset, best known for very long-form video question-answering. LLoVi also outperforms the previous state-of-the-art by **10.2%** and **6.2%** on NExT-QA and IntentQA for LVQA. Finally, we extend LLoVi to grounded VideoQA, which requires both QA and temporal localization, and show that it outperforms all prior methods on NExT-GQA. Code is available at https://github.com/CeeZh/LLoVi.
%R 10.18653/v1/2024.emnlp-main.1209
%U https://aclanthology.org/2024.emnlp-main.1209
%U https://doi.org/10.18653/v1/2024.emnlp-main.1209
%P 21715-21737
Markdown (Informal)
[A Simple LLM Framework for Long-Range Video Question-Answering](https://aclanthology.org/2024.emnlp-main.1209) (Zhang et al., EMNLP 2024)
ACL
- Ce Zhang, Taixi Lu, Md Mohaiminul Islam, Ziyang Wang, Shoubin Yu, Mohit Bansal, and Gedas Bertasius. 2024. A Simple LLM Framework for Long-Range Video Question-Answering. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 21715–21737, Miami, Florida, USA. Association for Computational Linguistics.