@inproceedings{thomason-etal-2017-guiding,
title = "Guiding Interaction Behaviors for Multi-modal Grounded Language Learning",
author = "Thomason, Jesse and
Sinapov, Jivko and
Mooney, Raymond",
editor = "Bansal, Mohit and
Matuszek, Cynthia and
Andreas, Jacob and
Artzi, Yoav and
Bisk, Yonatan",
booktitle = "Proceedings of the First Workshop on Language Grounding for Robotics",
month = aug,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W17-2803",
doi = "10.18653/v1/W17-2803",
pages = "20--24",
abstract = "Multi-modal grounded language learning connects language predicates to physical properties of objects in the world. Sensing with multiple modalities, such as audio, haptics, and visual colors and shapes while performing interaction behaviors like lifting, dropping, and looking on objects enables a robot to ground non-visual predicates like {``}empty{''} as well as visual predicates like {``}red{''}. Previous work has established that grounding in multi-modal space improves performance on object retrieval from human descriptions. In this work, we gather behavior annotations from humans and demonstrate that these improve language grounding performance by allowing a system to focus on relevant behaviors for words like {``}white{''} or {``}half-full{''} that can be understood by looking or lifting, respectively. We also explore adding modality annotations (whether to focus on audio or haptics when performing a behavior), which improves performance, and sharing information between linguistically related predicates (if {``}green{''} is a color, {``}white{''} is a color), which improves grounding recall but at the cost of precision.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="thomason-etal-2017-guiding">
    <titleInfo>
      <title>Guiding Interaction Behaviors for Multi-modal Grounded Language Learning</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jesse</namePart>
      <namePart type="family">Thomason</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jivko</namePart>
      <namePart type="family">Sinapov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Raymond</namePart>
      <namePart type="family">Mooney</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2017-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop on Language Grounding for Robotics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Cynthia</namePart>
        <namePart type="family">Matuszek</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jacob</namePart>
        <namePart type="family">Andreas</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yoav</namePart>
        <namePart type="family">Artzi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yonatan</namePart>
        <namePart type="family">Bisk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vancouver, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Multi-modal grounded language learning connects language predicates to physical properties of objects in the world. Sensing with multiple modalities, such as audio, haptics, and visual colors and shapes while performing interaction behaviors like lifting, dropping, and looking on objects enables a robot to ground non-visual predicates like “empty” as well as visual predicates like “red”. Previous work has established that grounding in multi-modal space improves performance on object retrieval from human descriptions. In this work, we gather behavior annotations from humans and demonstrate that these improve language grounding performance by allowing a system to focus on relevant behaviors for words like “white” or “half-full” that can be understood by looking or lifting, respectively. We also explore adding modality annotations (whether to focus on audio or haptics when performing a behavior), which improves performance, and sharing information between linguistically related predicates (if “green” is a color, “white” is a color), which improves grounding recall but at the cost of precision.</abstract>
    <identifier type="citekey">thomason-etal-2017-guiding</identifier>
    <identifier type="doi">10.18653/v1/W17-2803</identifier>
    <location>
      <url>https://aclanthology.org/W17-2803</url>
    </location>
    <part>
      <date>2017-08</date>
      <extent unit="page">
        <start>20</start>
        <end>24</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Guiding Interaction Behaviors for Multi-modal Grounded Language Learning
%A Thomason, Jesse
%A Sinapov, Jivko
%A Mooney, Raymond
%Y Bansal, Mohit
%Y Matuszek, Cynthia
%Y Andreas, Jacob
%Y Artzi, Yoav
%Y Bisk, Yonatan
%S Proceedings of the First Workshop on Language Grounding for Robotics
%D 2017
%8 August
%I Association for Computational Linguistics
%C Vancouver, Canada
%F thomason-etal-2017-guiding
%X Multi-modal grounded language learning connects language predicates to physical properties of objects in the world. Sensing with multiple modalities, such as audio, haptics, and visual colors and shapes while performing interaction behaviors like lifting, dropping, and looking on objects enables a robot to ground non-visual predicates like “empty” as well as visual predicates like “red”. Previous work has established that grounding in multi-modal space improves performance on object retrieval from human descriptions. In this work, we gather behavior annotations from humans and demonstrate that these improve language grounding performance by allowing a system to focus on relevant behaviors for words like “white” or “half-full” that can be understood by looking or lifting, respectively. We also explore adding modality annotations (whether to focus on audio or haptics when performing a behavior), which improves performance, and sharing information between linguistically related predicates (if “green” is a color, “white” is a color), which improves grounding recall but at the cost of precision.
%R 10.18653/v1/W17-2803
%U https://aclanthology.org/W17-2803
%U https://doi.org/10.18653/v1/W17-2803
%P 20-24
Markdown (Informal)
[Guiding Interaction Behaviors for Multi-modal Grounded Language Learning](https://aclanthology.org/W17-2803) (Thomason et al., RoboNLP 2017)
ACL
Jesse Thomason, Jivko Sinapov, and Raymond Mooney. 2017. Guiding Interaction Behaviors for Multi-modal Grounded Language Learning. In Proceedings of the First Workshop on Language Grounding for Robotics, pages 20–24, Vancouver, Canada. Association for Computational Linguistics.