% EACL 2017 long paper (Volume 1) presenting the DUTA dataset of labeled Tor hidden-service addresses.
@InProceedings{alnabki-EtAl:2017:EACLlong,
  author    = {Al Nabki, Mhd Wesam and Fidalgo, Eduardo and Alegre, Enrique and de Paz, Ivan},
  title     = {Classifying Illegal Activities on {Tor} Network Based on Web Textual Contents},
  booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {35--43},
  abstract  = {The freedom of the Deep Web offers a safe place where people can express
	themselves anonymously but they also can conduct illegal activities. In this
	paper, we present and make publicly available a new dataset for Darknet active
	domains, which we call ``Darknet Usage Text Addresses'' (DUTA). We built
	DUTA by sampling the Tor network during two months and manually labeled each
	address into 26 classes. Using DUTA, we conducted a comparison between two
	well-known text representation techniques crossed by three different supervised
	classifiers to categorize the Tor hidden services. We also fixed the pipeline
	elements and identified the aspects that have a critical influence on the
	classification results. We found that the combination of TFIDF words
	representation with Logistic Regression classifier achieves 96.6\% of 10 folds
	cross-validation accuracy and a macro F1 score of 93.7\% when classifying a
	subset of illegal activities from DUTA. The good performance of the classifier
	might support potential tools to help the authorities in the detection of these
	activities.},
  url       = {http://www.aclweb.org/anthology/E17-1004}
}

