@inproceedings{xu-etal-2024-exploring-question,
title = "Exploring Question Guidance and Answer Calibration for Visually Grounded Video Question Answering",
author = "Xu, Yuanxing and
Wei, Yuting and
Zhong, Shuai and
Chen, Xinming and
Qi, Jinsheng and
Wu, Bin",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.176",
pages = "3121--3133",
abstract = "Video Question Answering (VideoQA) tasks require not only correct answers but also visual evidence. The {``}localize-then-answer{''} strategy, while enhancing accuracy and interpretability, faces challenges due to the lack of temporal localization labels in VideoQA datasets. Existing methods often train the models{'} localization capabilities indirectly using QA labels, leading to inaccurate localization. Moreover, our experiments show that despite high accuracy, current models depend too heavily on language shortcuts or spurious correlations with irrelevant visual context. To address these issues, we propose a Question-Guided and Answer-Calibrated TRansformer (QGAC-TR), which guides and calibrates localization using question and option texts without localization labels. Furthermore, we design two self-supervised learning tasks to further enhance the model{'}s refined localization capabilities. Extensive experiments on three public datasets focused on temporal and causal reasoning show that our model not only achieves accuracy comparable to large-scale pretrained models but also leads in localization aspects. Code will be available on GitHub.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xu-etal-2024-exploring-question">
<titleInfo>
<title>Exploring Question Guidance and Answer Calibration for Visually Grounded Video Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuanxing</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuting</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuai</namePart>
<namePart type="family">Zhong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinming</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinsheng</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Video Question Answering (VideoQA) tasks require not only correct answers but also visual evidence. The “localize-then-answer” strategy, while enhancing accuracy and interpretability, faces challenges due to the lack of temporal localization labels in VideoQA datasets. Existing methods often train the models’ localization capabilities indirectly using QA labels, leading to inaccurate localization. Moreover, our experiments show that despite high accuracy, current models depend too heavily on language shortcuts or spurious correlations with irrelevant visual context. To address these issues, we propose a Question-Guided and Answer-Calibrated TRansformer (QGAC-TR), which guides and calibrates localization using question and option texts without localization labels. Furthermore, we design two self-supervised learning tasks to further enhance the model’s refined localization capabilities. Extensive experiments on three public datasets focused on temporal and causal reasoning show that our model not only achieves accuracy comparable to large-scale pretrained models but also leads in localization aspects. Code will be available on GitHub.</abstract>
<identifier type="citekey">xu-etal-2024-exploring-question</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.176</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>3121</start>
<end>3133</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring Question Guidance and Answer Calibration for Visually Grounded Video Question Answering
%A Xu, Yuanxing
%A Wei, Yuting
%A Zhong, Shuai
%A Chen, Xinming
%A Qi, Jinsheng
%A Wu, Bin
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F xu-etal-2024-exploring-question
%X Video Question Answering (VideoQA) tasks require not only correct answers but also visual evidence. The “localize-then-answer” strategy, while enhancing accuracy and interpretability, faces challenges due to the lack of temporal localization labels in VideoQA datasets. Existing methods often train the models’ localization capabilities indirectly using QA labels, leading to inaccurate localization. Moreover, our experiments show that despite high accuracy, current models depend too heavily on language shortcuts or spurious correlations with irrelevant visual context. To address these issues, we propose a Question-Guided and Answer-Calibrated TRansformer (QGAC-TR), which guides and calibrates localization using question and option texts without localization labels. Furthermore, we design two self-supervised learning tasks to further enhance the model’s refined localization capabilities. Extensive experiments on three public datasets focused on temporal and causal reasoning show that our model not only achieves accuracy comparable to large-scale pretrained models but also leads in localization aspects. Code will be available on GitHub.
%U https://aclanthology.org/2024.findings-emnlp.176
%P 3121-3133