% EMNLP 2017 conference paper (ACL Anthology ID D17-1275, per the url field).
% The month uses the standard three-letter macro so styles can localize or abbreviate it.
@inproceedings{durrett-EtAl:2017:EMNLP2017,
  author    = {Durrett, Greg and Kummerfeld, Jonathan K. and Berg-Kirkpatrick, Taylor and Portnoff, Rebecca and Afroz, Sadia and McCoy, Damon and Levchenko, Kirill and Paxson, Vern},
  title     = {Identifying Products in Online Cybercrime Marketplaces: A Dataset for Fine-grained Domain Adaptation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2598--2607},
  abstract  = {One weakness of machine-learned NLP models is that they typically perform
    poorly on out-of-domain data. In this work, we study the task of identifying
    products being bought and sold in online cybercrime forums, which exhibits
    particularly challenging cross-domain effects. We formulate a task that
    represents a hybrid of slot-filling information extraction and named entity
    recognition and annotate data from four different forums. Each of these forums
    constitutes its own ``fine-grained domain'' in that the forums cover different
    market sectors with different properties, even though all forums are in the
    broad domain of cybercrime. We characterize these domain differences in the
    context of a learning-based system: supervised models see decreased accuracy
    when applied to new forums, and standard techniques for semi-supervised
    learning and domain adaptation have limited effectiveness on this data, which
    suggests the need to improve these techniques. We release a dataset of 1,938
    annotated posts from across the four forums.},
  url       = {https://www.aclweb.org/anthology/D17-1275}
}

