@inproceedings{kiela-etal-2018-learning,
title = "Learning Visually Grounded Sentence Representations",
author = "Kiela, Douwe and
Conneau, Alexis and
Jabri, Allan and
Nickel, Maximilian",
editor = "Walker, Marilyn and
Ji, Heng and
Stent, Amanda",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = "2018",
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N18-1038",
doi = "10.18653/v1/N18-1038",
pages = "408--418",
abstract = "We investigate grounded sentence representations, where we train a sentence encoder to predict the image features of a given caption{---}i.e., we try to {``}imagine{''} how a sentence would be depicted visually{---}and use the resultant features as sentence representations. We examine the quality of the learned representations on a variety of standard sentence representation quality benchmarks, showing improved performance for grounded models over non-grounded ones. In addition, we thoroughly analyze the extent to which grounding contributes to improved performance, and show that the system also learns improved word embeddings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kiela-etal-2018-learning">
<titleInfo>
<title>Learning Visually Grounded Sentence Representations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Douwe</namePart>
<namePart type="family">Kiela</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Conneau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Allan</namePart>
<namePart type="family">Jabri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maximilian</namePart>
<namePart type="family">Nickel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marilyn</namePart>
<namePart type="family">Walker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amanda</namePart>
<namePart type="family">Stent</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">New Orleans, Louisiana</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We investigate grounded sentence representations, where we train a sentence encoder to predict the image features of a given caption—i.e., we try to “imagine” how a sentence would be depicted visually—and use the resultant features as sentence representations. We examine the quality of the learned representations on a variety of standard sentence representation quality benchmarks, showing improved performance for grounded models over non-grounded ones. In addition, we thoroughly analyze the extent to which grounding contributes to improved performance, and show that the system also learns improved word embeddings.</abstract>
<identifier type="citekey">kiela-etal-2018-learning</identifier>
<identifier type="doi">10.18653/v1/N18-1038</identifier>
<location>
<url>https://aclanthology.org/N18-1038</url>
</location>
<part>
<date>2018-06</date>
<extent unit="page">
<start>408</start>
<end>418</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning Visually Grounded Sentence Representations
%A Kiela, Douwe
%A Conneau, Alexis
%A Jabri, Allan
%A Nickel, Maximilian
%Y Walker, Marilyn
%Y Ji, Heng
%Y Stent, Amanda
%S Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)
%D 2018
%8 June
%I Association for Computational Linguistics
%C New Orleans, Louisiana
%F kiela-etal-2018-learning
%X We investigate grounded sentence representations, where we train a sentence encoder to predict the image features of a given caption—i.e., we try to “imagine” how a sentence would be depicted visually—and use the resultant features as sentence representations. We examine the quality of the learned representations on a variety of standard sentence representation quality benchmarks, showing improved performance for grounded models over non-grounded ones. In addition, we thoroughly analyze the extent to which grounding contributes to improved performance, and show that the system also learns improved word embeddings.
%R 10.18653/v1/N18-1038
%U https://aclanthology.org/N18-1038
%U https://doi.org/10.18653/v1/N18-1038
%P 408-418
Markdown (Informal)
[Learning Visually Grounded Sentence Representations](https://aclanthology.org/N18-1038) (Kiela et al., NAACL 2018)
ACL
- Douwe Kiela, Alexis Conneau, Allan Jabri, and Maximilian Nickel. 2018. Learning Visually Grounded Sentence Representations. In Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers), pages 408–418, New Orleans, Louisiana. Association for Computational Linguistics.