@inproceedings{jansen-2020-visually,
    title = "Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions",
    author = "Jansen, Peter",
    editor = "Cohn, Trevor and
      He, Yulan and
      Liu, Yang",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.findings-emnlp.395/",
    doi = "10.18653/v1/2020.findings-emnlp.395",
    pages = "4412--4417",
    abstract = "The recently proposed ALFRED challenge task aims for a virtual robotic agent to complete complex multi-step everyday tasks in a virtual home environment from high-level natural language directives, such as {\textquotedblleft}put a hot piece of bread on a plate{\textquotedblright}. Currently, the best-performing models are able to complete less than 1{\%} of these tasks successfully. In this work we focus on modeling the translation problem of converting natural language directives into detailed multi-step sequences of actions that accomplish those goals in the virtual environment. We empirically demonstrate that it is possible to generate gold multi-step plans from language directives alone without any visual input in 26{\%} of unseen cases. When a small amount of visual information, the starting location in the virtual environment, is incorporated, our best-performing GPT-2 model successfully generates gold command sequences in 58{\%} of cases, suggesting contextualized language models may provide strong planning modules for grounded virtual agents."
}
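
The BibTeX record above can be spot-checked programmatically. A minimal sketch in Python, assuming the third-party bibtexparser package (v1.x) is installed and the entry has been saved as jansen2020.bib; the package choice and the filename are illustrative, not part of the Anthology export:

# Minimal sketch: read the BibTeX record above and pull out a few fields.
# Assumes bibtexparser v1.x (pip install bibtexparser) and that the entry
# was saved to a file named jansen2020.bib (both are assumptions).
import bibtexparser

with open("jansen2020.bib") as f:
    db = bibtexparser.load(f)   # parse the .bib file into a database

entry = db.entries[0]           # each entry is a plain dict, field names lowercased
print(entry["ID"])              # -> jansen-2020-visually
print(entry["title"])
print(entry["doi"])             # -> 10.18653/v1/2020.findings-emnlp.395
print(entry["pages"])           # -> 4412--4417
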
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="jansen-2020-visually">
    <titleInfo>
      <title>Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Peter</namePart>
      <namePart type="family">Jansen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Trevor</namePart>
        <namePart type="family">Cohn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yulan</namePart>
        <namePart type="family">He</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The recently proposed ALFRED challenge task aims for a virtual robotic agent to complete complex multi-step everyday tasks in a virtual home environment from high-level natural language directives, such as “put a hot piece of bread on a plate”. Currently, the best-performing models are able to complete less than 1% of these tasks successfully. In this work we focus on modeling the translation problem of converting natural language directives into detailed multi-step sequences of actions that accomplish those goals in the virtual environment. We empirically demonstrate that it is possible to generate gold multi-step plans from language directives alone without any visual input in 26% of unseen cases. When a small amount of visual information, the starting location in the virtual environment, is incorporated, our best-performing GPT-2 model successfully generates gold command sequences in 58% of cases, suggesting contextualized language models may provide strong planning modules for grounded virtual agents.</abstract>
    <identifier type="citekey">jansen-2020-visually</identifier>
    <identifier type="doi">10.18653/v1/2020.findings-emnlp.395</identifier>
    <location>
      <url>https://aclanthology.org/2020.findings-emnlp.395/</url>
    </location>
    <part>
      <date>2020-11</date>
      <extent unit="page">
        <start>4412</start>
        <end>4417</end>
      </extent>
    </part>
  </mods>
</modsCollection>
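
The MODS record lends itself to the same kind of spot check using only the Python standard library. A minimal sketch, assuming the XML above has been saved as jansen2020.xml (the filename is illustrative):

# Minimal sketch: extract the title, DOI, and page range from the MODS
# record above with xml.etree.ElementTree (standard library only).
# Assumes the XML was saved to a file named jansen2020.xml (an assumption).
import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}   # MODS default namespace

root = ET.parse("jansen2020.xml").getroot()
mods = root.find("m:mods", NS)

title = mods.find("m:titleInfo/m:title", NS).text
doi = mods.find("m:identifier[@type='doi']", NS).text
start = mods.find("m:part/m:extent/m:start", NS).text
end = mods.find("m:part/m:extent/m:end", NS).text

print(title)
print(doi)                    # -> 10.18653/v1/2020.findings-emnlp.395
print(f"pp. {start}-{end}")   # -> pp. 4412-4417
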
%0 Conference Proceedings
%T Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions
%A Jansen, Peter
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F jansen-2020-visually
%X The recently proposed ALFRED challenge task aims for a virtual robotic agent to complete complex multi-step everyday tasks in a virtual home environment from high-level natural language directives, such as “put a hot piece of bread on a plate”. Currently, the best-performing models are able to complete less than 1% of these tasks successfully. In this work we focus on modeling the translation problem of converting natural language directives into detailed multi-step sequences of actions that accomplish those goals in the virtual environment. We empirically demonstrate that it is possible to generate gold multi-step plans from language directives alone without any visual input in 26% of unseen cases. When a small amount of visual information, the starting location in the virtual environment, is incorporated, our best-performing GPT-2 model successfully generates gold command sequences in 58% of cases, suggesting contextualized language models may provide strong planning modules for grounded virtual agents.
%R 10.18653/v1/2020.findings-emnlp.395
%U https://aclanthology.org/2020.findings-emnlp.395/
%U https://doi.org/10.18653/v1/2020.findings-emnlp.395
%P 4412-4417
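
The Refer/Endnote record above is a flat, line-oriented format: each line is a %-code followed by a value, and codes such as %Y (editor) and %U (URL) may repeat. A minimal Python sketch of reading it into a dict of lists; the record string below quotes only a few of the lines above for brevity:

# Minimal sketch: parse a Refer/Endnote-style record into a dict of lists.
# Repeating codes (%Y, %U) collect one value per line.
record = """%0 Conference Proceedings
%T Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions
%A Jansen, Peter
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%P 4412-4417"""

fields = {}
for line in record.splitlines():
    code, _, value = line.partition(" ")
    fields.setdefault(code, []).append(value)

print(fields["%T"][0])   # title
print(fields["%Y"])      # -> ['Cohn, Trevor', 'He, Yulan', 'Liu, Yang']
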
Markdown (Informal)
[Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions](https://aclanthology.org/2020.findings-emnlp.395/) (Jansen, Findings 2020)
ACL
Peter Jansen. 2020. Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 4412–4417, Online. Association for Computational Linguistics.