% System description paper for the DSL shared task at the VarDial 2017 workshop.
% The percent sign in the abstract is escaped so that styles which typeset the
% abstract do not treat the rest of the line as a TeX comment.
@InProceedings{gomez-EtAl:2017:VarDial,
  author    = {Gomez, Helena and Markov, Ilia and Baptista, Jorge and Sidorov, Grigori and Pinto, David},
  title     = {Discriminating between Similar Languages Using a Combination of Typed and Untyped Character N-grams and Words},
  booktitle = {Proceedings of the Fourth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial)},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {137--145},
  abstract  = {This paper presents the cic\_ualg's system that took part in the Discriminating
    between Similar Languages (DSL) shared task, held at the VarDial 2017 Workshop.
    This year's task aims at identifying 14 languages across 6 language groups
    using a corpus of excerpts of journalistic texts. Two classification approaches
    were compared: a single-step (all languages) approach and a two-step (language
    group and then languages within the group) approach. Features exploited include
    lexical features (unigrams of words) and character n-grams. Besides traditional
    (untyped) character n-grams, we introduce typed character n-grams in the DSL
    task. Experiments were carried out with different feature representation
    methods (binary and raw term frequency), frequency threshold values, and
    machine-learning algorithms -- Support Vector Machines (SVM) and Multinomial
    Naive Bayes (MNB). Our best run in the DSL task achieved 91.46\% accuracy.},
  url       = {http://www.aclweb.org/anthology/W17-1217}
}

