@inproceedings{wu-etal-2023-localizing,
    title = "Localizing Active Objects from Egocentric Vision with Symbolic World Knowledge",
    author = "Wu, Te-Lin and
      Zhou, Yu and
      Peng, Nanyun",
    editor = "Bouamor, Houda and
      Pino, Juan and
      Bali, Kalika",
    booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2023",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.emnlp-main.304",
    doi = "10.18653/v1/2023.emnlp-main.304",
    pages = "4991--5006",
    abstract = "The ability to actively ground task instructions from an egocentric view is crucial for AI agents to accomplish tasks or assist humans virtually. One important step towards this goal is to localize and track key active objects that undergo major state change as a consequence of human actions/interactions with the environment, without being told exactly what/where to ground (e.g., localizing and tracking the {`}sponge{'} in video from the instruction {``}Dip the sponge into the bucket.{''}). While existing works approach this problem from a pure vision perspective, we investigate to what extent the textual modality (i.e., task instructions) and its interaction with the visual modality can be beneficial. Specifically, we propose to improve phrase grounding models{'} ability to localize the active objects by: (1) learning the role of {`}objects undergoing change{'} and extracting them accurately from the instructions, (2) leveraging pre- and post-conditions of the objects during actions, and (3) recognizing the objects more robustly with descriptive knowledge. We leverage large language models (LLMs) to extract the aforementioned action-object knowledge, and design a per-object aggregation masking technique to effectively perform joint inference on object phrases and symbolic knowledge. We evaluate our framework on the Ego4D and Epic-Kitchens datasets. Extensive experiments demonstrate the effectiveness of our proposed framework, which leads to {\textgreater}54{\%} improvements in all standard metrics on the TREK-150-OPE-Det localization + tracking task, {\textgreater}7{\%} improvements in all standard metrics on the TREK-150-OPE tracking task, and {\textgreater}3{\%} improvements in average precision (AP) on the Ego4D SCOD task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wu-etal-2023-localizing">
    <titleInfo>
      <title>Localizing Active Objects from Egocentric Vision with Symbolic World Knowledge</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Te-Lin</namePart>
      <namePart type="family">Wu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yu</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nanyun</namePart>
      <namePart type="family">Peng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Houda</namePart>
        <namePart type="family">Bouamor</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="family">Pino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The ability to actively ground task instructions from an egocentric view is crucial for AI agents to accomplish tasks or assist humans virtually. One important step towards this goal is to localize and track key active objects that undergo major state change as a consequence of human actions/interactions with the environment, without being told exactly what/where to ground (e.g., localizing and tracking the ‘sponge’ in video from the instruction “Dip the sponge into the bucket.”). While existing works approach this problem from a pure vision perspective, we investigate to what extent the textual modality (i.e., task instructions) and its interaction with the visual modality can be beneficial. Specifically, we propose to improve phrase grounding models’ ability to localize the active objects by: (1) learning the role of ‘objects undergoing change’ and extracting them accurately from the instructions, (2) leveraging pre- and post-conditions of the objects during actions, and (3) recognizing the objects more robustly with descriptive knowledge. We leverage large language models (LLMs) to extract the aforementioned action-object knowledge, and design a per-object aggregation masking technique to effectively perform joint inference on object phrases and symbolic knowledge. We evaluate our framework on the Ego4D and Epic-Kitchens datasets. Extensive experiments demonstrate the effectiveness of our proposed framework, which leads to &gt;54% improvements in all standard metrics on the TREK-150-OPE-Det localization + tracking task, &gt;7% improvements in all standard metrics on the TREK-150-OPE tracking task, and &gt;3% improvements in average precision (AP) on the Ego4D SCOD task.</abstract>
<identifier type="citekey">wu-etal-2023-localizing</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.304</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.304</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>4991</start>
<end>5006</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Localizing Active Objects from Egocentric Vision with Symbolic World Knowledge
%A Wu, Te-Lin
%A Zhou, Yu
%A Peng, Nanyun
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F wu-etal-2023-localizing
%X The ability to actively ground task instructions from an egocentric view is crucial for AI agents to accomplish tasks or assist humans virtually. One important step towards this goal is to localize and track key active objects that undergo major state change as a consequence of human actions/interactions with the environment, without being told exactly what/where to ground (e.g., localizing and tracking the ‘sponge’ in video from the instruction “Dip the sponge into the bucket.”). While existing works approach this problem from a pure vision perspective, we investigate to what extent the textual modality (i.e., task instructions) and its interaction with the visual modality can be beneficial. Specifically, we propose to improve phrase grounding models’ ability to localize the active objects by: (1) learning the role of ‘objects undergoing change’ and extracting them accurately from the instructions, (2) leveraging pre- and post-conditions of the objects during actions, and (3) recognizing the objects more robustly with descriptive knowledge. We leverage large language models (LLMs) to extract the aforementioned action-object knowledge, and design a per-object aggregation masking technique to effectively perform joint inference on object phrases and symbolic knowledge. We evaluate our framework on the Ego4D and Epic-Kitchens datasets. Extensive experiments demonstrate the effectiveness of our proposed framework, which leads to >54% improvements in all standard metrics on the TREK-150-OPE-Det localization + tracking task, >7% improvements in all standard metrics on the TREK-150-OPE tracking task, and >3% improvements in average precision (AP) on the Ego4D SCOD task.
%R 10.18653/v1/2023.emnlp-main.304
%U https://aclanthology.org/2023.emnlp-main.304
%U https://doi.org/10.18653/v1/2023.emnlp-main.304
%P 4991-5006
Markdown (Informal)
[Localizing Active Objects from Egocentric Vision with Symbolic World Knowledge](https://aclanthology.org/2023.emnlp-main.304) (Wu et al., EMNLP 2023)
ACL
Te-Lin Wu, Yu Zhou, and Nanyun Peng. 2023. Localizing Active Objects from Egocentric Vision with Symbolic World Knowledge. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 4991–5006, Singapore. Association for Computational Linguistics.