@article{collell-moens-2018-learning,
title = "Learning Representations Specialized in Spatial Knowledge: Leveraging Language and Vision",
author = "Collell, Guillem and
Moens, Marie-Francine",
editor = "Lee, Lillian and
Johnson, Mark and
Toutanova, Kristina and
Roark, Brian",
journal = "Transactions of the Association for Computational Linguistics",
volume = "6",
year = "2018",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q18-1010/",
doi = "10.1162/tacl_a_00010",
pages = "133--144",
abstract = "Spatial understanding is crucial in many real-world problems, yet little progress has been made towards building representations that capture spatial knowledge. Here, we move one step forward in this direction and learn such representations by leveraging a task consisting in predicting continuous 2D spatial arrangements of objects given object-relationship-object instances (e.g., {\textquotedblleft}cat under chair{\textquotedblright}) and a simple neural network model that learns the task from annotated images. We show that the model succeeds in this task and, furthermore, that it is capable of predicting correct spatial arrangements for unseen objects if either CNN features or word embeddings of the objects are provided. The differences between visual and linguistic features are discussed. Next, to evaluate the spatial representations learned in the previous task, we introduce a task and a dataset consisting in a set of crowdsourced human ratings of spatial similarity for object pairs. We find that both CNN (convolutional neural network) features and word embeddings predict human judgments of similarity well and that these vectors can be further specialized in spatial knowledge if we update them when training the model that predicts spatial arrangements of objects. Overall, this paper paves the way towards building distributed spatial representations, contributing to the understanding of spatial expressions in language."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="collell-moens-2018-learning">
<titleInfo>
<title>Learning Representations Specialized in Spatial Knowledge: Leveraging Language and Vision</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guillem</namePart>
<namePart type="family">Collell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Spatial understanding is crucial in many real-world problems, yet little progress has been made towards building representations that capture spatial knowledge. Here, we move one step forward in this direction and learn such representations by leveraging a task consisting in predicting continuous 2D spatial arrangements of objects given object-relationship-object instances (e.g., “cat under chair”) and a simple neural network model that learns the task from annotated images. We show that the model succeeds in this task and, furthermore, that it is capable of predicting correct spatial arrangements for unseen objects if either CNN features or word embeddings of the objects are provided. The differences between visual and linguistic features are discussed. Next, to evaluate the spatial representations learned in the previous task, we introduce a task and a dataset consisting in a set of crowdsourced human ratings of spatial similarity for object pairs. We find that both CNN (convolutional neural network) features and word embeddings predict human judgments of similarity well and that these vectors can be further specialized in spatial knowledge if we update them when training the model that predicts spatial arrangements of objects. Overall, this paper paves the way towards building distributed spatial representations, contributing to the understanding of spatial expressions in language.</abstract>
<identifier type="citekey">collell-moens-2018-learning</identifier>
<identifier type="doi">10.1162/tacl_a_00010</identifier>
<location>
<url>https://aclanthology.org/Q18-1010/</url>
</location>
<part>
<date>2018</date>
<detail type="volume"><number>6</number></detail>
<extent unit="page">
<start>133</start>
<end>144</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Learning Representations Specialized in Spatial Knowledge: Leveraging Language and Vision
%A Collell, Guillem
%A Moens, Marie-Francine
%J Transactions of the Association for Computational Linguistics
%D 2018
%V 6
%I MIT Press
%C Cambridge, MA
%F collell-moens-2018-learning
%X Spatial understanding is crucial in many real-world problems, yet little progress has been made towards building representations that capture spatial knowledge. Here, we move one step forward in this direction and learn such representations by leveraging a task consisting in predicting continuous 2D spatial arrangements of objects given object-relationship-object instances (e.g., “cat under chair”) and a simple neural network model that learns the task from annotated images. We show that the model succeeds in this task and, furthermore, that it is capable of predicting correct spatial arrangements for unseen objects if either CNN features or word embeddings of the objects are provided. The differences between visual and linguistic features are discussed. Next, to evaluate the spatial representations learned in the previous task, we introduce a task and a dataset consisting in a set of crowdsourced human ratings of spatial similarity for object pairs. We find that both CNN (convolutional neural network) features and word embeddings predict human judgments of similarity well and that these vectors can be further specialized in spatial knowledge if we update them when training the model that predicts spatial arrangements of objects. Overall, this paper paves the way towards building distributed spatial representations, contributing to the understanding of spatial expressions in language.
%R 10.1162/tacl_a_00010
%U https://aclanthology.org/Q18-1010/
%U https://doi.org/10.1162/tacl_a_00010
%P 133-144
Markdown (Informal)
[Learning Representations Specialized in Spatial Knowledge: Leveraging Language and Vision](https://aclanthology.org/Q18-1010/) (Collell & Moens, TACL 2018)
ACL