@inproceedings{kondapally-etal-2024-action,
title = "Action Inference for Destination Prediction in Vision-and-Language Navigation",
author = "Kondapally, Anirudh and
Yamada, Kentaro and
Yanaka, Hitomi",
editor = "Fu, Xiyan and
Fleisig, Eve",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-srw.26/",
doi = "10.18653/v1/2024.acl-srw.26",
pages = "192--199",
abstract = "Vision-and-Language Navigation (VLN) encompasses interacting with autonomous vehicles using language and visual input from the perspective of mobility.Most of the previous work in this field focuses on spatial reasoning and the semantic grounding of visual information.However, reasoning based on the actions of pedestrians in the scene is not much considered.In this study, we provide a VLN dataset for destination prediction with action inference to investigate the extent to which current VLN models perform action inference.We introduce a crowd-sourcing process to construct a dataset for this task in two steps: (1) collecting beliefs about the next action for a pedestrian and (2) annotating the destination considering the pedestrian`s next action.Our benchmarking results of the models on destination prediction lead us to believe that the models can learn to reason about the effect of the action and the next action on the destination to a certain extent.However, there is still much scope for improvement."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kondapally-etal-2024-action">
<titleInfo>
<title>Action Inference for Destination Prediction in Vision-and-Language Navigation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anirudh</namePart>
<namePart type="family">Kondapally</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Yamada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitomi</namePart>
<namePart type="family">Yanaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiyan</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eve</namePart>
<namePart type="family">Fleisig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Vision-and-Language Navigation (VLN) encompasses interacting with autonomous vehicles using language and visual input from the perspective of mobility. Most of the previous work in this field focuses on spatial reasoning and the semantic grounding of visual information. However, reasoning based on the actions of pedestrians in the scene is not much considered. In this study, we provide a VLN dataset for destination prediction with action inference to investigate the extent to which current VLN models perform action inference. We introduce a crowd-sourcing process to construct a dataset for this task in two steps: (1) collecting beliefs about the next action for a pedestrian and (2) annotating the destination considering the pedestrian's next action. Our benchmarking results of the models on destination prediction lead us to believe that the models can learn to reason about the effect of the action and the next action on the destination to a certain extent. However, there is still much scope for improvement.</abstract>
<identifier type="citekey">kondapally-etal-2024-action</identifier>
<identifier type="doi">10.18653/v1/2024.acl-srw.26</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-srw.26/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>192</start>
<end>199</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Action Inference for Destination Prediction in Vision-and-Language Navigation
%A Kondapally, Anirudh
%A Yamada, Kentaro
%A Yanaka, Hitomi
%Y Fu, Xiyan
%Y Fleisig, Eve
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F kondapally-etal-2024-action
%X Vision-and-Language Navigation (VLN) encompasses interacting with autonomous vehicles using language and visual input from the perspective of mobility. Most of the previous work in this field focuses on spatial reasoning and the semantic grounding of visual information. However, reasoning based on the actions of pedestrians in the scene is not much considered. In this study, we provide a VLN dataset for destination prediction with action inference to investigate the extent to which current VLN models perform action inference. We introduce a crowd-sourcing process to construct a dataset for this task in two steps: (1) collecting beliefs about the next action for a pedestrian and (2) annotating the destination considering the pedestrian's next action. Our benchmarking results of the models on destination prediction lead us to believe that the models can learn to reason about the effect of the action and the next action on the destination to a certain extent. However, there is still much scope for improvement.
%R 10.18653/v1/2024.acl-srw.26
%U https://aclanthology.org/2024.luhme-srw.26/
%U https://doi.org/10.18653/v1/2024.acl-srw.26
%P 192-199
Markdown (Informal)
[Action Inference for Destination Prediction in Vision-and-Language Navigation](https://aclanthology.org/2024.luhme-srw.26/) (Kondapally et al., ACL 2024)
ACL
Anirudh Kondapally, Kentaro Yamada, and Hitomi Yanaka. 2024. [Action Inference for Destination Prediction in Vision-and-Language Navigation](https://aclanthology.org/2024.luhme-srw.26/). In *Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)*, pages 192–199, Bangkok, Thailand. Association for Computational Linguistics.