% VarDial3 workshop paper (COLING 2016 Organizing Committee, Osaka).
% The abstract is stored for reference; LaTeX specials in it (percent signs,
% quotes, number ranges) are escaped so that styles which print the abstract
% do not break the generated bibliography.
@InProceedings{alshutayri-EtAl:2016:VarDial3,
  author    = {Alshutayri, Areej and Atwell, Eric and Alosaimy, Abdulrahman and Dickins, James and Ingleby, Michael and Watson, Janet},
  title     = {{Arabic} Language {WEKA}-Based Dialect Classifier for {Arabic} Automatic Speech Recognition Transcripts},
  booktitle = {Proceedings of the Third Workshop on {NLP} for Similar Languages, Varieties and Dialects ({VarDial3})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {204--211},
  abstract  = {This paper describes an Arabic dialect identification system which we developed
    for the Discriminating Similar Languages (DSL) 2016 shared task. We classified
    Arabic dialects by using Waikato Environment for Knowledge Analysis (WEKA) data
    analytic tool which contains many alternative filters and classifiers for
    machine learning. We experimented with several classifiers and the best
    accuracy was achieved using the Sequential Minimal Optimization (SMO) algorithm
    for training and testing process set to three different feature-sets for each
    testing process. Our approach achieved an accuracy equal to 42.85\% which is
    considerably worse in comparison to the evaluation scores on the training set
    of 80--90\% and with training set ``60:40'' percentage split which achieved
    accuracy around 50\%. We observed that Buckwalter transcripts from the Saarland
    Automatic Speech Recognition (ASR) system are given without short vowels,
    though the Buckwalter system has notation for these. We elaborate such
    observations, describe our methods and analyse the training dataset.},
  url       = {http://aclweb.org/anthology/W16-4826},
}

