% VarDial3 workshop paper (COLING 2016): language identification for Arabicized Berber and Arabic varieties.
@InProceedings{adouane-EtAl:2016:VarDial3,
  author    = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard and Bobicev, Victoria},
  title     = {Automatic Detection of {Arabicized} {Berber} and {Arabic} Varieties},
  booktitle = {Proceedings of the Third Workshop on {NLP} for Similar Languages, Varieties and Dialects ({VarDial3})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {63--72},
  abstract  = {Automatic Language Identification (ALI) is the detection of the natural
    language of an input text by a machine. It is the first necessary step to do
    any language-dependent natural language processing task. Various methods have
    been successfully applied to a wide range of languages, and the
    state-of-the-art automatic language identifiers are mainly based on character
    n-gram models trained on huge corpora. However, there are many languages which
    are not yet automatically processed, for instance minority and informal
    languages. Many of these languages are only spoken and do not exist in a
    written format. Social media platforms and new technologies have facilitated
    the emergence of written format for these spoken languages based on
    pronunciation. The latter are not well represented on the Web, commonly
    referred to as under-resourced languages, and the current available ALI tools
    fail to properly recognize them. In this paper, we revisit the problem of ALI
    with the focus on Arabicized Berber and dialectal Arabic short texts. We
    introduce new resources and evaluate the existing methods. The results show that
    machine learning models combined with lexicons are well suited for detecting
    Arabicized Berber and different Arabic varieties and distinguishing between
    them, giving a macro-average F-score of 92.94\%.},
  url       = {http://aclweb.org/anthology/W16-4809}
}

