@inproceedings{rubioromano-EtAl:2017:VL17,
  author    = {Rubio Romano, Antonio and Yu, LongLong and Simo-Serra, Edgar and Moreno-Noguer, Francesc},
  title     = {Multi-Modal Fashion Product Retrieval},
  booktitle = {Proceedings of the Sixth Workshop on Vision and Language},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {43--45},
  doi       = {10.18653/v1/W17-2007},
  url       = {http://www.aclweb.org/anthology/W17-2007},
  abstract  = {Finding a product in the fashion world can be a daunting task. Everyday,
    e-commerce sites are updating with thousands of images and their associated
    metadata (textual information), deepening the problem. In this paper, we
    leverage both the images and textual metadata and propose a joint multi-modal
    embedding that maps both the text and images into a common latent space.
    Distances in the latent space correspond to similarity between products,
    allowing us to effectively perform retrieval in this latent space. We compare
    against existing approaches and show significant improvements in retrieval
    tasks on a large-scale e-commerce dataset.},
}

