@inproceedings{kiela-EtAl:2018:N18-1,
  author    = {Kiela, Douwe and Conneau, Alexis and Jabri, Allan and Nickel, Maximilian},
  title     = {Learning Visually Grounded Sentence Representations},
  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  month     = jun,
  year      = {2018},
  address   = {New Orleans, Louisiana},
  publisher = {Association for Computational Linguistics},
  pages     = {408--418},
  abstract  = {We investigate grounded sentence representations, where we train a sentence encoder to predict the image features of a given caption---i.e., we try to ``imagine'' how a sentence would be depicted visually---and use the resultant features as sentence representations. We examine the quality of the learned representations on a variety of standard sentence representation quality benchmarks, showing improved performance for grounded models over non-grounded ones. In addition, we thoroughly analyze the extent to which grounding contributes to improved performance, and show that the system also learns improved word embeddings.},
  doi       = {10.18653/v1/N18-1038},
  url       = {https://aclanthology.org/N18-1038},
}

