@inproceedings{huang-etal-2019-multi-modal,
title = "Multi-modal Discriminative Model for Vision-and-Language Navigation",
author = "Huang, Haoshuo and
Jain, Vihan and
Mehta, Harsh and
Baldridge, Jason and
Ie, Eugene",
editor = "Bhatia, Archna and
Bisk, Yonatan and
Kordjamshidi, Parisa and
Thomason, Jesse",
booktitle = "Proceedings of the Combined Workshop on Spatial Language Understanding ({S}p{LU}) and Grounded Communication for Robotics ({R}obo{NLP})",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-1605",
doi = "10.18653/v1/W19-1605",
pages = "40--49",
abstract = "Vision-and-Language Navigation (VLN) is a natural language grounding task where agents have to interpret natural language instructions in the context of visual scenes in a dynamic environment to achieve prescribed navigation goals. Successful agents must have the ability to parse natural language of varying linguistic styles, ground them in potentially unfamiliar scenes, plan and react with ambiguous environmental feedback. Generalization ability is limited by the amount of human annotated data. In particular, paired vision-language sequence data is expensive to collect. We develop a discriminator that evaluates how well an instruction explains a given path in VLN task using multi-modal alignment. Our study reveals that only a small fraction of the high-quality augmented data from Fried et al., as scored by our discriminator, is useful for training VLN agents with similar performance. We also show that a VLN agent warm-started with pre-trained components from the discriminator outperforms the benchmark success rates of 35.5 by 10{\%} relative measure.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2019-multi-modal">
<titleInfo>
<title>Multi-modal Discriminative Model for Vision-and-Language Navigation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haoshuo</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vihan</namePart>
<namePart type="family">Jain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harsh</namePart>
<namePart type="family">Mehta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="family">Baldridge</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugene</namePart>
<namePart type="family">Ie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Combined Workshop on Spatial Language Understanding (SpLU) and Grounded Communication for Robotics (RoboNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Archna</namePart>
<namePart type="family">Bhatia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Bisk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parisa</namePart>
<namePart type="family">Kordjamshidi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jesse</namePart>
<namePart type="family">Thomason</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Minneapolis, Minnesota</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Vision-and-Language Navigation (VLN) is a natural language grounding task in which agents must interpret natural language instructions in the context of visual scenes in a dynamic environment to achieve prescribed navigation goals. Successful agents must be able to parse natural language of varying linguistic styles, ground it in potentially unfamiliar scenes, and plan and react under ambiguous environmental feedback. Generalization ability is limited by the amount of human-annotated data; in particular, paired vision-language sequence data is expensive to collect. We develop a discriminator that uses multi-modal alignment to evaluate how well an instruction explains a given path in the VLN task. Our study reveals that only a small fraction of the high-quality augmented data from Fried et al., as scored by our discriminator, is needed to train VLN agents to similar performance. We also show that a VLN agent warm-started with pre-trained components from the discriminator outperforms the benchmark success rate of 35.5 by 10% in relative terms.</abstract>
<identifier type="citekey">huang-etal-2019-multi-modal</identifier>
<identifier type="doi">10.18653/v1/W19-1605</identifier>
<location>
<url>https://aclanthology.org/W19-1605</url>
</location>
<part>
<date>2019-06</date>
<extent unit="page">
<start>40</start>
<end>49</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-modal Discriminative Model for Vision-and-Language Navigation
%A Huang, Haoshuo
%A Jain, Vihan
%A Mehta, Harsh
%A Baldridge, Jason
%A Ie, Eugene
%Y Bhatia, Archna
%Y Bisk, Yonatan
%Y Kordjamshidi, Parisa
%Y Thomason, Jesse
%S Proceedings of the Combined Workshop on Spatial Language Understanding (SpLU) and Grounded Communication for Robotics (RoboNLP)
%D 2019
%8 June
%I Association for Computational Linguistics
%C Minneapolis, Minnesota
%F huang-etal-2019-multi-modal
%X Vision-and-Language Navigation (VLN) is a natural language grounding task in which agents must interpret natural language instructions in the context of visual scenes in a dynamic environment to achieve prescribed navigation goals. Successful agents must be able to parse natural language of varying linguistic styles, ground it in potentially unfamiliar scenes, and plan and react under ambiguous environmental feedback. Generalization ability is limited by the amount of human-annotated data; in particular, paired vision-language sequence data is expensive to collect. We develop a discriminator that uses multi-modal alignment to evaluate how well an instruction explains a given path in the VLN task. Our study reveals that only a small fraction of the high-quality augmented data from Fried et al., as scored by our discriminator, is needed to train VLN agents to similar performance. We also show that a VLN agent warm-started with pre-trained components from the discriminator outperforms the benchmark success rate of 35.5 by 10% in relative terms.
%R 10.18653/v1/W19-1605
%U https://aclanthology.org/W19-1605
%U https://doi.org/10.18653/v1/W19-1605
%P 40-49
Markdown (Informal)
[Multi-modal Discriminative Model for Vision-and-Language Navigation](https://aclanthology.org/W19-1605) (Huang et al., RoboNLP 2019)
ACL
Haoshuo Huang, Vihan Jain, Harsh Mehta, Jason Baldridge, and Eugene Ie. 2019. Multi-modal Discriminative Model for Vision-and-Language Navigation. In Proceedings of the Combined Workshop on Spatial Language Understanding (SpLU) and Grounded Communication for Robotics (RoboNLP), pages 40–49, Minneapolis, Minnesota. Association for Computational Linguistics.