% Gella, Sennrich, Keller and Lapata (EMNLP 2017): image-pivoted multilingual multimodal representations.
@InProceedings{gella-EtAl:2017:EMNLP2017,
  author    = {Gella, Spandana and Sennrich, Rico and Keller, Frank and Lapata, Mirella},
  title     = {Image Pivoting for Learning Multilingual Multimodal Representations},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2839--2845},
  abstract  = {In this paper we propose a model to learn multimodal multilingual
               representations for matching images and sentences in different
               languages, with the aim of advancing multilingual versions of image
               search and image understanding. Our model learns a common
               representation for images and their descriptions in two different
               languages (which need not be parallel) by considering the image as
               a pivot between two languages. We introduce a new pairwise ranking
               loss function which can handle both symmetric and asymmetric
               similarity between the two modalities. We evaluate our models on
               image-description ranking for German and English, and on semantic
               textual similarity of image descriptions in English. In both cases
               we achieve state-of-the-art performance.},
  doi       = {10.18653/v1/D17-1303},
  url       = {https://www.aclweb.org/anthology/D17-1303}
}

