@inproceedings{abdessaied-etal-2022-video,
title = "Video Language Co-Attention with Multimodal Fast-Learning Feature Fusion for {V}ideo{QA}",
author = "Abdessaied, Adnen and
Sood, Ekta and
Bulling, Andreas",
booktitle = "Proceedings of the 7th Workshop on Representation Learning for NLP",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.repl4nlp-1.15",
doi = "10.18653/v1/2022.repl4nlp-1.15",
pages = "143--155",
abstract = "We propose the Video Language Co-Attention Network (VLCN) {--} a novel memory-enhanced model for Video Question Answering (VideoQA). Our model combines two original contributions{''}:'' A multi-modal fast-learning feature fusion (FLF) block and a mechanism that uses self-attended language features to separately guide neural attention on both static and dynamic visual features extracted from individual video frames and short video clips. When trained from scratch, VLCN achieves competitive results with the state of the art on both MSVD-QA and MSRVTT-QA with 38.06{\%} and 36.01{\%} test accuracies, respectively. Through an ablation study, we further show that FLF improves generalization across different VideoQA datasets and performance for question types that are notoriously challenging in current datasets, such as long questions that require deeper reasoning as well as questions with rare answers.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abdessaied-etal-2022-video">
    <titleInfo>
        <title>Video Language Co-Attention with Multimodal Fast-Learning Feature Fusion for VideoQA</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Adnen</namePart>
        <namePart type="family">Abdessaied</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Ekta</namePart>
        <namePart type="family">Sood</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Bulling</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 7th Workshop on Representation Learning for NLP</title>
        </titleInfo>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Dublin, Ireland</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We propose the Video Language Co-Attention Network (VLCN) – a novel memory-enhanced model for Video Question Answering (VideoQA). Our model combines two original contributions: A multi-modal fast-learning feature fusion (FLF) block and a mechanism that uses self-attended language features to separately guide neural attention on both static and dynamic visual features extracted from individual video frames and short video clips. When trained from scratch, VLCN achieves competitive results with the state of the art on both MSVD-QA and MSRVTT-QA with 38.06% and 36.01% test accuracies, respectively. Through an ablation study, we further show that FLF improves generalization across different VideoQA datasets and performance for question types that are notoriously challenging in current datasets, such as long questions that require deeper reasoning as well as questions with rare answers.</abstract>
<identifier type="citekey">abdessaied-etal-2022-video</identifier>
<identifier type="doi">10.18653/v1/2022.repl4nlp-1.15</identifier>
<location>
<url>https://aclanthology.org/2022.repl4nlp-1.15</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>143</start>
<end>155</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Video Language Co-Attention with Multimodal Fast-Learning Feature Fusion for VideoQA
%A Abdessaied, Adnen
%A Sood, Ekta
%A Bulling, Andreas
%S Proceedings of the 7th Workshop on Representation Learning for NLP
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F abdessaied-etal-2022-video
%X We propose the Video Language Co-Attention Network (VLCN) – a novel memory-enhanced model for Video Question Answering (VideoQA). Our model combines two original contributions: A multi-modal fast-learning feature fusion (FLF) block and a mechanism that uses self-attended language features to separately guide neural attention on both static and dynamic visual features extracted from individual video frames and short video clips. When trained from scratch, VLCN achieves competitive results with the state of the art on both MSVD-QA and MSRVTT-QA with 38.06% and 36.01% test accuracies, respectively. Through an ablation study, we further show that FLF improves generalization across different VideoQA datasets and performance for question types that are notoriously challenging in current datasets, such as long questions that require deeper reasoning as well as questions with rare answers.
%R 10.18653/v1/2022.repl4nlp-1.15
%U https://aclanthology.org/2022.repl4nlp-1.15
%U https://doi.org/10.18653/v1/2022.repl4nlp-1.15
%P 143-155
Markdown (Informal)
[Video Language Co-Attention with Multimodal Fast-Learning Feature Fusion for VideoQA](https://aclanthology.org/2022.repl4nlp-1.15) (Abdessaied et al., RepL4NLP 2022)
ACL
Adnen Abdessaied, Ekta Sood, and Andreas Bulling. 2022. Video Language Co-Attention with Multimodal Fast-Learning Feature Fusion for VideoQA. In Proceedings of the 7th Workshop on Representation Learning for NLP, pages 143–155, Dublin, Ireland. Association for Computational Linguistics.