% EACL 2017 long paper (ACL Anthology ID E17-1091): taxonomy categorization of
% e-commerce product listings. The abstract field is informational only;
% standard bibliography styles ignore it.
@InProceedings{das-EtAl:2017:EACLlong2,
  author    = {Das, Pradipto and Xia, Yandi and Levine, Aaron and Di Fabbrizio, Giuseppe and Datta, Ankur},
  title     = {Web-Scale Language-Independent Cataloging of Noisy Product Listings for E-Commerce},
  booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {969--979},
  abstract  = {The cataloging of product listings through taxonomy categorization is a
    fundamental problem for any e-commerce marketplace, with applications ranging
    from personalized search recommendations to query understanding.
    However, manual and rule based approaches to categorization are not scalable.
    In this paper, we compare several classifiers for categorizing listings in both
    English and Japanese product catalogs.
    We show empirically that a combination of words from product titles,
    navigational breadcrumbs, and list prices, when available, improves results
    significantly.
    We outline a novel method using correspondence topic models and a lightweight
    manual process to reduce noise from mis-labeled data in the training set.
    We contrast linear models, gradient boosted trees (GBTs) and convolutional
    neural networks (CNNs), and show that GBTs and CNNs yield the highest gains in
    error reduction.
    Finally, we show GBTs applied in a language-agnostic way on a large-scale
    Japanese e-commerce dataset have improved taxonomy categorization performance
    over current state-of-the-art based on deep belief network models.},
  url       = {http://www.aclweb.org/anthology/E17-1091}
}

