% Ionescu and Popescu, VarDial3 workshop paper (COLING 2016) on the ADI shared task.
% Percent signs in the abstract are escaped so styles that typeset it stay valid LaTeX.
@InProceedings{ionescu-popescu:2016:VarDial3,
  author    = {Ionescu, Radu Tudor and Popescu, Marius},
  title     = {{UnibucKernel}: An Approach for {Arabic} Dialect Identification Based on Multiple String Kernels},
  booktitle = {Proceedings of the Third Workshop on {NLP} for Similar Languages, Varieties and Dialects ({VarDial3})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {135--144},
  abstract  = {The most common approach in text mining classification tasks is to rely on
	features like words, part-of-speech tags, stems, or some other high-level
	linguistic features. Unlike the common approach, we present a method that uses
	only character p-grams (also known as n-grams) as features for the Arabic
	Dialect Identification (ADI) Closed Shared Task of the DSL 2016 Challenge. The
	proposed approach combines
	several string kernels using multiple kernel learning. In the learning stage,
	we try both Kernel Discriminant Analysis (KDA) and Kernel Ridge Regression
	(KRR), and we choose KDA as it gives better results in a 10-fold
	cross-validation carried out on the training set. Our approach is shallow and
	simple, but the empirical results obtained in the ADI Shared Task prove that it
	achieves very good results. Indeed, we ranked on the second place with an
	accuracy of 50.91\% and a weighted F1 score of 51.31\%. We also present improved
	results in this paper, which we obtained after the competition ended. Simply by
	adding more regularization into our model to make it more suitable for test
	data that comes from a different distribution than training data, we obtain an
	accuracy of 51.82\% and a weighted F1 score of 52.18\%. Furthermore, the proposed
	approach has an important advantage in that it is language independent and
	linguistic theory neutral, as it does not require any NLP tools.},
  url       = {http://aclweb.org/anthology/W16-4818}
}

