% Workshop paper (BSNLP 2017, published by ACL): gender classification on
% Slovene Twitter text, comparing token-based and lemma-based features.
% Encoding notes: percent signs in the abstract are escaped so the entry stays
% safe under styles that print abstracts; the accented author initial is
% wrapped in its own brace group so BibTeX treats it as a single letter when
% sorting and building labels; proper nouns in the title are brace-protected
% against sentence-casing styles.
@inproceedings{verhoeven-vskrjanec-pollak:2017:BSNLP,
  author    = {Verhoeven, Ben and {\v{S}}krjanec, Iza and Pollak, Senja},
  title     = {Gender Profiling for {Slovene} {Twitter} Communication: The Influence of Gender Marking, Content and Style},
  booktitle = {Proceedings of the 6th Workshop on Balto-Slavic Natural Language Processing},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {119--125},
  abstract  = {We present results of the first gender classification experiments on Slovene
               text to our knowledge. Inspired by the TwiSty corpus and experiments (Verhoeven
               et al., 2016), we employed the Janes corpus (Erjavec et al., 2016) and its
               gender annotations to perform gender classification experiments on Twitter text
               comparing a token-based and a lemma-based approach. We find that the
               token-based approach (92.6\% accuracy), containing gender markings related to
               the author, outperforms the lemma-based approach by about 5\%. Especially in the
               lemmatized version, we also observe stylistic and content-based differences in
               writing between men (e.g. more profane language, numerals and beer mentions)
               and women (e.g. more pronouns, emoticons and character flooding). Many of our
               findings corroborate previous research on other languages.},
  url       = {http://www.aclweb.org/anthology/W17-1418},
}

