% VarDial3 (COLING 2016 workshop) paper on automatic identification of
% Romanized Arabic and Romanized Berber. Proper nouns and acronyms are
% brace-protected against style recasing; month uses the standard macro;
% LaTeX specials in the abstract are escaped so it can be typeset safely.
@InProceedings{adouane-semmar-johansson:2016:VarDial31,
  author    = {Adouane, Wafia  and  Semmar, Nasredine  and  Johansson, Richard},
  title     = {Romanized {Berber} and Romanized {Arabic} Automatic Language Identification Using Machine Learning},
  booktitle = {Proceedings of the Third Workshop on {NLP} for Similar Languages, Varieties and Dialects ({VarDial3})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {53--61},
  abstract  = {The identification of the language of text/speech input is the first step to be
	able to properly do any language-dependent natural language processing. The
	task is called Automatic Language Identification (ALI). Being a well-studied
	field since early 1960's, various methods have been applied to many standard
	languages. The ALI standard methods require datasets for training and use
	character/word-based n-gram models. However, social media and new technologies
	have contributed to the rise of informal and minority languages on the Web. The
	state-of-the-art automatic language identifiers fail to properly identify
	many of them. Romanized Arabic (RA) and Romanized Berber (RB) are cases of
	these informal languages which are under-resourced. The goal of this paper is
	twofold: detect RA and RB, at a document level, as separate languages and
	distinguish between them as they coexist in North Africa. We consider the task
	as a classification problem and use supervised machine learning to solve it.
	For both languages, character-based 5-grams combined with additional lexicons
	score the best, F-score of 99.75\% and 97.77\% for RB and RA respectively.},
  url       = {http://aclweb.org/anthology/W16-4807}
}

