@inproceedings{zhuo-etal-2023-vilpact,
title = "{V}i{LPA}ct: A Benchmark for Compositional Generalization on Multimodal Human Activities",
author = "Zhuo, Terry Yue and
Liao, Yaqing and
Lei, Yuecheng and
Qu, Lizhen and
de Melo, Gerard and
Chang, Xiaojun and
Ren, Yazhou and
Xu, Zenglin",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-eacl.164",
doi = "10.18653/v1/2023.findings-eacl.164",
pages = "2192--2207",
abstract = "We introduce $\texttt{ViLPAct}$, a novel vision-language benchmark for human activity planning. It is designed for a task where embodied AI agents can reason and forecast future actions of humans based on video clips about their initial activities and intents in text. The dataset consists of 2.9k videos from $\texttt{Charades}$ extended with intents via crowdsourcing, a multi-choice question test set, and four strong baselines. One of the baselines implements a neurosymbolic approach based on a multi-modal knowledge base (MKB), while the other ones are deep generative models adapted from recent state-of-the-art (SOTA) methods. According to our extensive experiments, the key challenges are compositional generalization and effective use of information from both modalities.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhuo-etal-2023-vilpact">
<titleInfo>
<title>ViLPAct: A Benchmark for Compositional Generalization on Multimodal Human Activities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Terry</namePart>
<namePart type="given">Yue</namePart>
<namePart type="family">Zhuo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaqing</namePart>
<namePart type="family">Liao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuecheng</namePart>
<namePart type="family">Lei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lizhen</namePart>
<namePart type="family">Qu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerard</namePart>
<namePart type="family">de Melo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yazhou</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zenglin</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Vlachos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabelle</namePart>
<namePart type="family">Augenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce ViLPAct, a novel vision-language benchmark for human activity planning. It is designed for a task where embodied AI agents can reason and forecast future actions of humans based on video clips about their initial activities and intents in text. The dataset consists of 2.9k videos from Charades extended with intents via crowdsourcing, a multi-choice question test set, and four strong baselines. One of the baselines implements a neurosymbolic approach based on a multi-modal knowledge base (MKB), while the other ones are deep generative models adapted from recent state-of-the-art (SOTA) methods. According to our extensive experiments, the key challenges are compositional generalization and effective use of information from both modalities.</abstract>
<identifier type="citekey">zhuo-etal-2023-vilpact</identifier>
<identifier type="doi">10.18653/v1/2023.findings-eacl.164</identifier>
<location>
<url>https://aclanthology.org/2023.findings-eacl.164</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>2192</start>
<end>2207</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ViLPAct: A Benchmark for Compositional Generalization on Multimodal Human Activities
%A Zhuo, Terry Yue
%A Liao, Yaqing
%A Lei, Yuecheng
%A Qu, Lizhen
%A de Melo, Gerard
%A Chang, Xiaojun
%A Ren, Yazhou
%A Xu, Zenglin
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F zhuo-etal-2023-vilpact
%X We introduce ViLPAct, a novel vision-language benchmark for human activity planning. It is designed for a task where embodied AI agents can reason and forecast future actions of humans based on video clips about their initial activities and intents in text. The dataset consists of 2.9k videos from Charades extended with intents via crowdsourcing, a multi-choice question test set, and four strong baselines. One of the baselines implements a neurosymbolic approach based on a multi-modal knowledge base (MKB), while the other ones are deep generative models adapted from recent state-of-the-art (SOTA) methods. According to our extensive experiments, the key challenges are compositional generalization and effective use of information from both modalities.
%R 10.18653/v1/2023.findings-eacl.164
%U https://aclanthology.org/2023.findings-eacl.164
%U https://doi.org/10.18653/v1/2023.findings-eacl.164
%P 2192-2207
Markdown (Informal)
[ViLPAct: A Benchmark for Compositional Generalization on Multimodal Human Activities](https://aclanthology.org/2023.findings-eacl.164) (Zhuo et al., Findings 2023)
ACL
Terry Yue Zhuo, Yaqing Liao, Yuecheng Lei, Lizhen Qu, Gerard de Melo, Xiaojun Chang, Yazhou Ren, and Zenglin Xu. 2023. ViLPAct: A Benchmark for Compositional Generalization on Multimodal Human Activities. In Findings of the Association for Computational Linguistics: EACL 2023, pages 2192–2207, Dubrovnik, Croatia. Association for Computational Linguistics.