@article{socher-etal-2014-grounded,
title = "Grounded Compositional Semantics for Finding and Describing Images with Sentences",
author = "Socher, Richard and
Karpathy, Andrej and
Le, Quoc V. and
Manning, Christopher D. and
Ng, Andrew Y.",
editor = "Lin, Dekang and
Collins, Michael and
Lee, Lillian",
journal = "Transactions of the Association for Computational Linguistics",
volume = "2",
year = "2014",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q14-1017",
doi = "10.1162/tacl_a_00177",
pages = "207--218",
abstract = "Previous work on Recursive Neural Networks (RNNs) shows that these models can produce compositional feature vectors for accurately representing and classifying sentences or images. However, the sentence vectors of previous models cannot accurately represent visually grounded meaning. We introduce the DT-RNN model which uses dependency trees to embed sentences into a vector space in order to retrieve images that are described by those sentences. Unlike previous RNN-based models which use constituency trees, DT-RNNs naturally focus on the action and agents in a sentence. They are better able to abstract from the details of word order and syntactic expression. DT-RNNs outperform other recursive and recurrent neural networks, kernelized CCA and a bag-of-words baseline on the tasks of finding an image that fits a sentence description and vice versa. They also give more similar representations to sentences that describe the same image.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="socher-etal-2014-grounded">
<titleInfo>
<title>Grounded Compositional Semantics for Finding and Describing Images with Sentences</title>
</titleInfo>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Socher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrej</namePart>
<namePart type="family">Karpathy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quoc</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Manning</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="given">Y</namePart>
<namePart type="family">Ng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Previous work on Recursive Neural Networks (RNNs) shows that these models can produce compositional feature vectors for accurately representing and classifying sentences or images. However, the sentence vectors of previous models cannot accurately represent visually grounded meaning. We introduce the DT-RNN model which uses dependency trees to embed sentences into a vector space in order to retrieve images that are described by those sentences. Unlike previous RNN-based models which use constituency trees, DT-RNNs naturally focus on the action and agents in a sentence. They are better able to abstract from the details of word order and syntactic expression. DT-RNNs outperform other recursive and recurrent neural networks, kernelized CCA and a bag-of-words baseline on the tasks of finding an image that fits a sentence description and vice versa. They also give more similar representations to sentences that describe the same image.</abstract>
<identifier type="citekey">socher-etal-2014-grounded</identifier>
<identifier type="doi">10.1162/tacl_a_00177</identifier>
<location>
<url>https://aclanthology.org/Q14-1017</url>
</location>
<part>
<date>2014</date>
<detail type="volume"><number>2</number></detail>
<extent unit="page">
<start>207</start>
<end>218</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Grounded Compositional Semantics for Finding and Describing Images with Sentences
%A Socher, Richard
%A Karpathy, Andrej
%A Le, Quoc V.
%A Manning, Christopher D.
%A Ng, Andrew Y.
%J Transactions of the Association for Computational Linguistics
%D 2014
%V 2
%I MIT Press
%C Cambridge, MA
%F socher-etal-2014-grounded
%X Previous work on Recursive Neural Networks (RNNs) shows that these models can produce compositional feature vectors for accurately representing and classifying sentences or images. However, the sentence vectors of previous models cannot accurately represent visually grounded meaning. We introduce the DT-RNN model which uses dependency trees to embed sentences into a vector space in order to retrieve images that are described by those sentences. Unlike previous RNN-based models which use constituency trees, DT-RNNs naturally focus on the action and agents in a sentence. They are better able to abstract from the details of word order and syntactic expression. DT-RNNs outperform other recursive and recurrent neural networks, kernelized CCA and a bag-of-words baseline on the tasks of finding an image that fits a sentence description and vice versa. They also give more similar representations to sentences that describe the same image.
%R 10.1162/tacl_a_00177
%U https://aclanthology.org/Q14-1017
%U https://doi.org/10.1162/tacl_a_00177
%P 207-218
Markdown (Informal)
[Grounded Compositional Semantics for Finding and Describing Images with Sentences](https://aclanthology.org/Q14-1017) (Socher et al., TACL 2014)
ACL