% Workshop paper (NLP+CSS 2017). Accented letters in the author names are
% written as BibTeX "special characters" ({\v{s}}, {\'c}, {\v{z}}) so each is
% handled as a single letter; {Twitter} and {NLP} are braced so that
% sentence-casing styles keep their capitalisation.
@InProceedings{ljubevsic-fivser-erjavec:2017:NLPandCSS,
  author    = {Ljube{\v{s}}i{\'c}, Nikola and Fi{\v{s}}er, Darja and Erjavec, Toma{\v{z}}},
  title     = {Language-independent Gender Prediction on {Twitter}},
  booktitle = {Proceedings of the Second Workshop on {NLP} and Computational Social Science},
  month     = aug,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1--6},
  abstract  = {In this paper we present a set of experiments and analyses on predicting the
               gender of Twitter users based on language-independent features extracted either
               from the text or the metadata of users' tweets. We perform our experiments on
               the TwiSty dataset containing manual gender annotations for users speaking six
               different languages. Our classification results show that, while the prediction
               model based on language-independent features performs worse than the
               bag-of-words model when training and testing on the same language, it regularly
               outperforms the bag-of-words model when applied to different languages, showing
               very stable results across various languages. Finally we perform a comparative
               analysis of feature effect sizes across the six languages and show that
               differences in our features correspond to cultural distances.},
  url       = {http://www.aclweb.org/anthology/W17-2901},
}

