@inproceedings{rubio-romano-etal-2017-multi,
    title = "Multi-Modal Fashion Product Retrieval",
    author = "Rubio Romano, Antonio  and
      Yu, LongLong  and
      Simo-Serra, Edgar  and
      Moreno-Noguer, Francesc",
    editor = "Belz, Anya  and
      Erdem, Erkut  and
      Pastra, Katerina  and
      Mikolajczyk, Krystian",
    booktitle = "Proceedings of the Sixth Workshop on Vision and Language",
    month = apr,
    year = "2017",
    address = "Valencia, Spain",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/W17-2007",
    doi = "10.18653/v1/W17-2007",
    pages = "43--45",
    abstract = "Finding a product in the fashion world can be a daunting task. Every day, e-commerce sites are updated with thousands of images and their associated metadata (textual information), deepening the problem. In this paper, we leverage both the images and the textual metadata and propose a joint multi-modal embedding that maps both the text and images into a common latent space. Distances in the latent space correspond to similarity between products, allowing us to effectively perform retrieval in this latent space. We compare against existing approaches and show significant improvements in retrieval tasks on a large-scale e-commerce dataset.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rubio-romano-etal-2017-multi">
    <titleInfo>
        <title>Multi-Modal Fashion Product Retrieval</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Antonio</namePart>
        <namePart type="family">Rubio Romano</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">LongLong</namePart>
        <namePart type="family">Yu</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Edgar</namePart>
        <namePart type="family">Simo-Serra</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Francesc</namePart>
        <namePart type="family">Moreno-Noguer</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2017-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the Sixth Workshop on Vision and Language</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Anya</namePart>
            <namePart type="family">Belz</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Erkut</namePart>
            <namePart type="family">Erdem</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Katerina</namePart>
            <namePart type="family">Pastra</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Krystian</namePart>
            <namePart type="family">Mikolajczyk</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Valencia, Spain</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Finding a product in the fashion world can be a daunting task. Every day, e-commerce sites are updated with thousands of images and their associated metadata (textual information), deepening the problem. In this paper, we leverage both the images and the textual metadata and propose a joint multi-modal embedding that maps both the text and images into a common latent space. Distances in the latent space correspond to similarity between products, allowing us to effectively perform retrieval in this latent space. We compare against existing approaches and show significant improvements in retrieval tasks on a large-scale e-commerce dataset.</abstract>
    <identifier type="citekey">rubio-romano-etal-2017-multi</identifier>
    <identifier type="doi">10.18653/v1/W17-2007</identifier>
    <location>
        <url>https://aclanthology.org/W17-2007</url>
    </location>
    <part>
        <date>2017-04</date>
        <extent unit="page">
            <start>43</start>
            <end>45</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-Modal Fashion Product Retrieval
%A Rubio Romano, Antonio
%A Yu, LongLong
%A Simo-Serra, Edgar
%A Moreno-Noguer, Francesc
%Y Belz, Anya
%Y Erdem, Erkut
%Y Pastra, Katerina
%Y Mikolajczyk, Krystian
%S Proceedings of the Sixth Workshop on Vision and Language
%D 2017
%8 April
%I Association for Computational Linguistics
%C Valencia, Spain
%F rubio-romano-etal-2017-multi
%X Finding a product in the fashion world can be a daunting task. Every day, e-commerce sites are updated with thousands of images and their associated metadata (textual information), deepening the problem. In this paper, we leverage both the images and the textual metadata and propose a joint multi-modal embedding that maps both the text and images into a common latent space. Distances in the latent space correspond to similarity between products, allowing us to effectively perform retrieval in this latent space. We compare against existing approaches and show significant improvements in retrieval tasks on a large-scale e-commerce dataset.
%R 10.18653/v1/W17-2007
%U https://aclanthology.org/W17-2007
%U https://doi.org/10.18653/v1/W17-2007
%P 43-45
Markdown (Informal)

[Multi-Modal Fashion Product Retrieval](https://aclanthology.org/W17-2007) (Rubio Romano et al., VL 2017)

ACL

Antonio Rubio Romano, LongLong Yu, Edgar Simo-Serra, and Francesc Moreno-Noguer. 2017. Multi-Modal Fashion Product Retrieval. In Proceedings of the Sixth Workshop on Vision and Language, pages 43–45, Valencia, Spain. Association for Computational Linguistics.
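The abstract describes retrieval as nearest-neighbor search in a joint text-image latent space. As a purely illustrative sketch of that general idea, not the authors' model, the NumPy snippet below ranks catalog items by distance to a query once both modalities are projected into a common space. The projection matrices are random stand-ins for the learned embeddings, and all dimensions and names are hypothetical.

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical feature sizes: image descriptor, text descriptor, latent space.
D_IMG, D_TXT, D_LATENT = 512, 300, 128

# Random stand-ins for projections that the paper learns from data.
W_img = rng.standard_normal((D_IMG, D_LATENT))
W_txt = rng.standard_normal((D_TXT, D_LATENT))

def embed(features: np.ndarray, W: np.ndarray) -> np.ndarray:
    """Project features into the latent space and L2-normalize them,
    so that distance in this space reflects similarity."""
    z = features @ W
    return z / np.linalg.norm(z, axis=-1, keepdims=True)

# A toy catalog: image features for 1000 products, embedded once offline.
catalog = embed(rng.standard_normal((1000, D_IMG)), W_img)

# A text query (e.g. product metadata) projected into the same space.
query = embed(rng.standard_normal((1, D_TXT)), W_txt)

# Retrieval: rank products by Euclidean distance to the query.
distances = np.linalg.norm(catalog - query, axis=1)
top5 = np.argsort(distances)[:5]
print("Top-5 retrieved product indices:", top5)
```

Because both sides are L2-normalized, ranking by Euclidean distance is equivalent to ranking by cosine similarity; in the paper the projections are trained jointly so that matching image-text pairs land close together in the shared space.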