% Conference paper in the ACL 2025 long-papers proceedings volume.
% The coined name in the title is braced as a whole word so that styles which
% sentence-case titles keep its capitalisation without a mid-word brace group.
% The address field carries the location exactly as given in the exported record.
@inproceedings{chen-etal-2025-vqaguider,
    author    = {Chen, Yuyan and
                 Jia, Jiyuan and
                 Lu, Jiaxin and
                 Li, Siyue and
                 Guan, Yu and
                 Yang, Ming and
                 Guo, Qingpei},
    title     = {{VQAGuider}: Guiding Multimodal Large Language Models to Answer Complex Video Questions},
    editor    = {Che, Wanxiang and
                 Nabende, Joyce and
                 Shutova, Ekaterina and
                 Pilehvar, Mohammad Taher},
    booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
    month     = jul,
    year      = {2025},
    address   = {Vienna, Austria},
    publisher = {Association for Computational Linguistics},
    pages     = {7821--7834},
    doi       = {10.18653/v1/2025.acl-long.385},
    url       = {https://aclanthology.org/2025.acl-long.385/},
    isbn      = {979-8-89176-251-0},
    abstract  = {Complex video question-answering (VQA) requires in-depth understanding of video contents including object and action recognition as well as video classification and summarization, which exhibits great potential in emerging applications in education and entertainment, etc. Multimodal large language models (MLLMs) may accomplish this task by grasping the intention of a question and decomposing it to a series of visual recognition sub-tasks to find out the answer with the help of an agent. To tackle this task, we first collect a new dedicated Complex VQA dataset named CVQA and then propose VQAGuider, an innovative framework planning a few atomic visual recognition tools by video-related API matching. VQAGuider facilitates a deep engagement with video content and precise responses to complex video-related questions by MLLMs, which is beyond aligning visual and language features for simple VQA tasks. Our experiments demonstrate VQAGuider is capable of navigating the complex VQA tasks by MLLMs and improves the accuracy by 29.6{\%} and 17.2{\%} on CVQA and the existing VQA datasets, respectively, highlighting its potential in advancing MLLMs{'}s capabilities in video understanding.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, identified by the mods ID below. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chen-etal-2025-vqaguider">
    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>VQAGuider: Guiding Multimodal Large Language Models to Answer Complex Video Questions</title>
    </titleInfo>
    <!-- Authors, one name element each, in the order listed in the record. -->
    <name type="personal">
      <namePart type="given">Yuyan</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiyuan</namePart>
      <namePart type="family">Jia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiaxin</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Siyue</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yu</namePart>
      <namePart type="family">Guan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ming</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qingpei</namePart>
      <namePart type="family">Guo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-251-0</identifier>
    </relatedItem>
    <abstract>Complex video question-answering (VQA) requires in-depth understanding of video contents including object and action recognition as well as video classification and summarization, which exhibits great potential in emerging applications in education and entertainment, etc. Multimodal large language models (MLLMs) may accomplish this task by grasping the intention of a question and decomposing it to a series of visual recognition sub-tasks to find out the answer with the help of an agent. To tackle this task, we first collect a new dedicated Complex VQA dataset named CVQA and then propose VQAGuider, an innovative framework planning a few atomic visual recognition tools by video-related API matching. VQAGuider facilitates a deep engagement with video content and precise responses to complex video-related questions by MLLMs, which is beyond aligning visual and language features for simple VQA tasks. Our experiments demonstrate VQAGuider is capable of navigating the complex VQA tasks by MLLMs and improves the accuracy by 29.6% and 17.2% on CVQA and the existing VQA datasets, respectively, highlighting its potential in advancing MLLMs’s capabilities in video understanding.</abstract>
    <!-- Identifiers and landing page for the paper. -->
    <identifier type="citekey">chen-etal-2025-vqaguider</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-long.385</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-long.385/</url>
    </location>
    <!-- Page extent of the paper, with the part date. -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>7821</start>
        <end>7834</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T VQAGuider: Guiding Multimodal Large Language Models to Answer Complex Video Questions
%A Chen, Yuyan
%A Jia, Jiyuan
%A Lu, Jiaxin
%A Li, Siyue
%A Guan, Yu
%A Yang, Ming
%A Guo, Qingpei
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F chen-etal-2025-vqaguider
%X Complex video question-answering (VQA) requires in-depth understanding of video contents including object and action recognition as well as video classification and summarization, which exhibits great potential in emerging applications in education and entertainment, etc. Multimodal large language models (MLLMs) may accomplish this task by grasping the intention of a question and decomposing it to a series of visual recognition sub-tasks to find out the answer with the help of an agent. To tackle this task, we first collect a new dedicated Complex VQA dataset named CVQA and then propose VQAGuider, an innovative framework planning a few atomic visual recognition tools by video-related API matching. VQAGuider facilitates a deep engagement with video content and precise responses to complex video-related questions by MLLMs, which is beyond aligning visual and language features for simple VQA tasks. Our experiments demonstrate VQAGuider is capable of navigating the complex VQA tasks by MLLMs and improves the accuracy by 29.6% and 17.2% on CVQA and the existing VQA datasets, respectively, highlighting its potential in advancing MLLMs’s capabilities in video understanding.
%R 10.18653/v1/2025.acl-long.385
%U https://aclanthology.org/2025.acl-long.385/
%U https://doi.org/10.18653/v1/2025.acl-long.385
%P 7821-7834
Markdown (Informal)
[VQAGuider: Guiding Multimodal Large Language Models to Answer Complex Video Questions](https://aclanthology.org/2025.acl-long.385/) (Chen et al., ACL 2025)
ACL