@inproceedings{wu-etal-2025-videoqa,
title = "{V}ideo{QA}-{TA}: Temporal-Aware Multi-Modal Video Question Answering",
author = "Wu, Zhixuan and
Cheng, Bo and
Han, Jiale and
Ma, Jiabao and
Zhang, Shuhao and
Chen, Yuli and
Li, Changbo",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.483/",
pages = "7239--7252",
abstract = "Video question answering (VideoQA) has recently gained considerable attention in the field of computer vision, aiming to generate answers rely on both linguistic and visual reasoning. However, existing methods often align visual or textual features directly with large language models, which limits the deep semantic association between modalities and hinders a comprehensive understanding of the interactions within spatial and temporal contexts, ultimately leading to sub-optimal reasoning performance. To address this issue, we propose a novel temporal-aware framework for multi-modal video question answering, dubbed VideoQA-TA, which enhances reasoning ability and accuracy of VideoQA by aligning videos and questions at fine-grained levels. Specifically, an effective Spatial-Temporal Attention mechanism (STA) is designed for video aggregation, transforming video features into spatial and temporal representations while attending to information at different levels. Furthermore, a Temporal Object Injection strategy (TOI) is proposed to align object-level and frame-level information within videos, which further improves the accuracy by injecting explicit temporal information. Experimental results on MSVD-QA, MSRVTT-QA, and ActivityNet-QA datasets demonstrate the superior performance of our proposed method compared with the current SOTAs, meanwhile, visualization analysis further verifies the effectiveness of incorporating temporal information to videos."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-etal-2025-videoqa">
<titleInfo>
<title>VideoQA-TA: Temporal-Aware Multi-Modal Video Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhixuan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiale</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiabao</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuhao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuli</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changbo</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Video question answering (VideoQA) has recently gained considerable attention in the field of computer vision, aiming to generate answers that rely on both linguistic and visual reasoning. However, existing methods often align visual or textual features directly with large language models, which limits the deep semantic association between modalities and hinders a comprehensive understanding of the interactions within spatial and temporal contexts, ultimately leading to sub-optimal reasoning performance. To address this issue, we propose a novel temporal-aware framework for multi-modal video question answering, dubbed VideoQA-TA, which enhances the reasoning ability and accuracy of VideoQA by aligning videos and questions at fine-grained levels. Specifically, an effective Spatial-Temporal Attention mechanism (STA) is designed for video aggregation, transforming video features into spatial and temporal representations while attending to information at different levels. Furthermore, a Temporal Object Injection strategy (TOI) is proposed to align object-level and frame-level information within videos, which further improves accuracy by injecting explicit temporal information. Experimental results on the MSVD-QA, MSRVTT-QA, and ActivityNet-QA datasets demonstrate the superior performance of our proposed method compared with current state-of-the-art approaches; meanwhile, visualization analysis further verifies the effectiveness of incorporating temporal information into videos.</abstract>
<identifier type="citekey">wu-etal-2025-videoqa</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.483/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>7239</start>
<end>7252</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VideoQA-TA: Temporal-Aware Multi-Modal Video Question Answering
%A Wu, Zhixuan
%A Cheng, Bo
%A Han, Jiale
%A Ma, Jiabao
%A Zhang, Shuhao
%A Chen, Yuli
%A Li, Changbo
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F wu-etal-2025-videoqa
%X Video question answering (VideoQA) has recently gained considerable attention in the field of computer vision, aiming to generate answers that rely on both linguistic and visual reasoning. However, existing methods often align visual or textual features directly with large language models, which limits the deep semantic association between modalities and hinders a comprehensive understanding of the interactions within spatial and temporal contexts, ultimately leading to sub-optimal reasoning performance. To address this issue, we propose a novel temporal-aware framework for multi-modal video question answering, dubbed VideoQA-TA, which enhances the reasoning ability and accuracy of VideoQA by aligning videos and questions at fine-grained levels. Specifically, an effective Spatial-Temporal Attention mechanism (STA) is designed for video aggregation, transforming video features into spatial and temporal representations while attending to information at different levels. Furthermore, a Temporal Object Injection strategy (TOI) is proposed to align object-level and frame-level information within videos, which further improves accuracy by injecting explicit temporal information. Experimental results on the MSVD-QA, MSRVTT-QA, and ActivityNet-QA datasets demonstrate the superior performance of our proposed method compared with current state-of-the-art approaches; meanwhile, visualization analysis further verifies the effectiveness of incorporating temporal information into videos.
%U https://aclanthology.org/2025.coling-main.483/
%P 7239-7252
Markdown (Informal)
[VideoQA-TA: Temporal-Aware Multi-Modal Video Question Answering](https://aclanthology.org/2025.coling-main.483/) (Wu et al., COLING 2025)
ACL
Zhixuan Wu, Bo Cheng, Jiale Han, Jiabao Ma, Shuhao Zhang, Yuli Chen, and Changbo Li. 2025. VideoQA-TA: Temporal-Aware Multi-Modal Video Question Answering. In Proceedings of the 31st International Conference on Computational Linguistics, pages 7239–7252, Abu Dhabi, UAE. Association for Computational Linguistics.
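For readers skimming the abstract above, the sketch below illustrates the general idea of a spatial-temporal attention aggregator over video features, i.e., attending over patches within each frame and then over frames across time. This is an illustrative reconstruction under stated assumptions, not the authors' released VideoQA-TA code: the class name, feature shapes, mean pooling, and hyperparameters are all assumptions, and the paper's TOI strategy is not shown.

```python
# Minimal sketch (not the authors' implementation): a generic spatial-temporal
# attention aggregator over video features of shape (batch, frames, patches, dim).
# All names and hyperparameters here are illustrative assumptions.
import torch
import torch.nn as nn


class SpatialTemporalAttention(nn.Module):
    def __init__(self, dim: int = 768, heads: int = 8):
        super().__init__()
        # Spatial attention mixes patch tokens within each frame;
        # temporal attention mixes frame tokens across time.
        self.spatial_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.temporal_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm_s = nn.LayerNorm(dim)
        self.norm_t = nn.LayerNorm(dim)

    def forward(self, video: torch.Tensor) -> torch.Tensor:
        b, t, p, d = video.shape
        # Spatial pass: treat each frame's patches as one attention sequence.
        x = video.reshape(b * t, p, d)
        s, _ = self.spatial_attn(x, x, x)
        x = self.norm_s(x + s)
        # Pool patches per frame, then attend across frames (temporal pass).
        frames = x.mean(dim=1).reshape(b, t, d)
        f, _ = self.temporal_attn(frames, frames, frames)
        frames = self.norm_t(frames + f)
        return frames  # (batch, frames, dim) temporal-aware video tokens


if __name__ == "__main__":
    feats = torch.randn(2, 16, 49, 768)  # 2 clips, 16 frames, 7x7 patches
    print(SpatialTemporalAttention()(feats).shape)  # torch.Size([2, 16, 768])
```

The resulting per-frame tokens could then be aligned with question embeddings by a downstream language model; how VideoQA-TA performs that alignment and injects object-level temporal cues is described in the paper itself.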