@inproceedings{sadhu-etal-2021-video,
title = "Video Question Answering with Phrases via Semantic Roles",
author = "Sadhu, Arka and
Chen, Kan and
Nevatia, Ram",
editor = "Toutanova, Kristina and
Rumshisky, Anna and
Zettlemoyer, Luke and
Hakkani-Tur, Dilek and
Beltagy, Iz and
Bethard, Steven and
Cotterell, Ryan and
Chakraborty, Tanmoy and
Zhou, Yichao",
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-main.196",
doi = "10.18653/v1/2021.naacl-main.196",
pages = "2460--2478",
abstract = "Video Question Answering (VidQA) evaluation metrics have been limited to a single-word answer or selecting a phrase from a fixed set of phrases. These metrics limit the VidQA models{'} application scenario. In this work, we leverage semantic roles derived from video descriptions to mask out certain phrases, to introduce VidQAP which poses VidQA as a fill-in-the-phrase task. To enable evaluation of answer phrases, we compute the relative improvement of the predicted answer compared to an empty string. To reduce the influence of language bias in VidQA datasets, we retrieve a video having a different answer for the same question. To facilitate research, we construct ActivityNet-SRL-QA and Charades-SRL-QA and benchmark them by extending three vision-language models. We perform extensive analysis and ablative studies to guide future work. Code and data are public.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sadhu-etal-2021-video">
<titleInfo>
<title>Video Question Answering with Phrases via Semantic Roles</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arka</namePart>
<namePart type="family">Sadhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kan</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ram</namePart>
<namePart type="family">Nevatia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kristina</namePart>
<namePart type="family">Toutanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rumshisky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Zettlemoyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iz</namePart>
<namePart type="family">Beltagy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yichao</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Video Question Answering (VidQA) evaluation metrics have been limited to a single-word answer or selecting a phrase from a fixed set of phrases. These metrics limit the VidQA models’ application scenario. In this work, we leverage semantic roles derived from video descriptions to mask out certain phrases, to introduce VidQAP which poses VidQA as a fill-in-the-phrase task. To enable evaluation of answer phrases, we compute the relative improvement of the predicted answer compared to an empty string. To reduce the influence of language bias in VidQA datasets, we retrieve a video having a different answer for the same question. To facilitate research, we construct ActivityNet-SRL-QA and Charades-SRL-QA and benchmark them by extending three vision-language models. We perform extensive analysis and ablative studies to guide future work. Code and data are public.</abstract>
<identifier type="citekey">sadhu-etal-2021-video</identifier>
<identifier type="doi">10.18653/v1/2021.naacl-main.196</identifier>
<location>
<url>https://aclanthology.org/2021.naacl-main.196</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>2460</start>
<end>2478</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Video Question Answering with Phrases via Semantic Roles
%A Sadhu, Arka
%A Chen, Kan
%A Nevatia, Ram
%Y Toutanova, Kristina
%Y Rumshisky, Anna
%Y Zettlemoyer, Luke
%Y Hakkani-Tur, Dilek
%Y Beltagy, Iz
%Y Bethard, Steven
%Y Cotterell, Ryan
%Y Chakraborty, Tanmoy
%Y Zhou, Yichao
%S Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F sadhu-etal-2021-video
%X Video Question Answering (VidQA) evaluation metrics have been limited to a single-word answer or selecting a phrase from a fixed set of phrases. These metrics limit the VidQA models’ application scenario. In this work, we leverage semantic roles derived from video descriptions to mask out certain phrases, to introduce VidQAP which poses VidQA as a fill-in-the-phrase task. To enable evaluation of answer phrases, we compute the relative improvement of the predicted answer compared to an empty string. To reduce the influence of language bias in VidQA datasets, we retrieve a video having a different answer for the same question. To facilitate research, we construct ActivityNet-SRL-QA and Charades-SRL-QA and benchmark them by extending three vision-language models. We perform extensive analysis and ablative studies to guide future work. Code and data are public.
%R 10.18653/v1/2021.naacl-main.196
%U https://aclanthology.org/2021.naacl-main.196
%U https://doi.org/10.18653/v1/2021.naacl-main.196
%P 2460-2478
Markdown (Informal)
[Video Question Answering with Phrases via Semantic Roles](https://aclanthology.org/2021.naacl-main.196) (Sadhu et al., NAACL 2021)
ACL
- Arka Sadhu, Kan Chen, and Ram Nevatia. 2021. Video Question Answering with Phrases via Semantic Roles. In Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 2460–2478, Online. Association for Computational Linguistics.