@inproceedings{hristov-etal-2017-grounding,
title = "Grounding Symbols in Multi-Modal Instructions",
author = "Hristov, Yordan and
Penkov, Svetlin and
Lascarides, Alex and
Ramamoorthy, Subramanian",
editor = "Bansal, Mohit and
Matuszek, Cynthia and
Andreas, Jacob and
Artzi, Yoav and
Bisk, Yonatan",
booktitle = "Proceedings of the First Workshop on Language Grounding for Robotics",
month = aug,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W17-2807",
doi = "10.18653/v1/W17-2807",
pages = "49--57",
abstract = "As robots begin to cohabit with humans in semi-structured environments, the need arises to understand instructions involving rich variability{---}for instance, learning to ground symbols in the physical world. Realistically, this task must cope with small datasets consisting of a particular users{'} contextual assignment of meaning to terms. We present a method for processing a raw stream of cross-modal input{---}i.e., linguistic instructions, visual perception of a scene and a concurrent trace of 3D eye tracking fixations{---}to produce the segmentation of objects with a correspondent association to high-level concepts. To test our framework we present experiments in a table-top object manipulation scenario. Our results show our model learns the user{'}s notion of colour and shape from a small number of physical demonstrations, generalising to identifying physical referents for novel combinations of the words.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hristov-etal-2017-grounding">
<titleInfo>
<title>Grounding Symbols in Multi-Modal Instructions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yordan</namePart>
<namePart type="family">Hristov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Svetlin</namePart>
<namePart type="family">Penkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Lascarides</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Subramanian</namePart>
<namePart type="family">Ramamoorthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Language Grounding for Robotics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cynthia</namePart>
<namePart type="family">Matuszek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacob</namePart>
<namePart type="family">Andreas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Artzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Bisk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vancouver, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As robots begin to cohabit with humans in semi-structured environments, the need arises to understand instructions involving rich variability—for instance, learning to ground symbols in the physical world. Realistically, this task must cope with small datasets consisting of a particular user’s contextual assignment of meaning to terms. We present a method for processing a raw stream of cross-modal input—i.e., linguistic instructions, visual perception of a scene and a concurrent trace of 3D eye tracking fixations—to produce the segmentation of objects with a corresponding association to high-level concepts. To test our framework, we present experiments in a table-top object manipulation scenario. Our results show that our model learns the user’s notion of colour and shape from a small number of physical demonstrations, generalising to identifying physical referents for novel combinations of the words.</abstract>
<identifier type="citekey">hristov-etal-2017-grounding</identifier>
<identifier type="doi">10.18653/v1/W17-2807</identifier>
<location>
<url>https://aclanthology.org/W17-2807</url>
</location>
<part>
<date>2017-08</date>
<extent unit="page">
<start>49</start>
<end>57</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Grounding Symbols in Multi-Modal Instructions
%A Hristov, Yordan
%A Penkov, Svetlin
%A Lascarides, Alex
%A Ramamoorthy, Subramanian
%Y Bansal, Mohit
%Y Matuszek, Cynthia
%Y Andreas, Jacob
%Y Artzi, Yoav
%Y Bisk, Yonatan
%S Proceedings of the First Workshop on Language Grounding for Robotics
%D 2017
%8 August
%I Association for Computational Linguistics
%C Vancouver, Canada
%F hristov-etal-2017-grounding
%X As robots begin to cohabit with humans in semi-structured environments, the need arises to understand instructions involving rich variability—for instance, learning to ground symbols in the physical world. Realistically, this task must cope with small datasets consisting of a particular user’s contextual assignment of meaning to terms. We present a method for processing a raw stream of cross-modal input—i.e., linguistic instructions, visual perception of a scene and a concurrent trace of 3D eye tracking fixations—to produce the segmentation of objects with a corresponding association to high-level concepts. To test our framework, we present experiments in a table-top object manipulation scenario. Our results show that our model learns the user’s notion of colour and shape from a small number of physical demonstrations, generalising to identifying physical referents for novel combinations of the words.
%R 10.18653/v1/W17-2807
%U https://aclanthology.org/W17-2807
%U https://doi.org/10.18653/v1/W17-2807
%P 49-57
Markdown (Informal)
[Grounding Symbols in Multi-Modal Instructions](https://aclanthology.org/W17-2807) (Hristov et al., RoboNLP 2017)
ACL
- Yordan Hristov, Svetlin Penkov, Alex Lascarides, and Subramanian Ramamoorthy. 2017. Grounding Symbols in Multi-Modal Instructions. In Proceedings of the First Workshop on Language Grounding for Robotics, pages 49–57, Vancouver, Canada. Association for Computational Linguistics.