% Bestgen (2017): CECL system paper from the VarDial 2017 workshop (ACL Anthology W17-1214).
% Acronyms in title/booktitle are brace-protected so sentence-casing styles keep their capitals.
@InProceedings{bestgen:2017:VarDial,
  author    = {Bestgen, Yves},
  title     = {Improving the Character Ngram Model for the {DSL} Task with {BM25} Weighting and Less Frequently Used Feature Sets},
  booktitle = {Proceedings of the Fourth Workshop on {NLP} for Similar Languages, Varieties and Dialects ({VarDial})},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {115--123},
  abstract  = {This paper describes the system developed by the Centre for English Corpus
    Linguistics (CECL) to discriminating similar languages, language varieties and
    dialects. Based on a SVM with character and POStag n-grams as features and the
    BM25 weighting scheme, it achieved 92.7\% accuracy in the Discriminating
    between Similar Languages (DSL) task, ranking first among eleven systems but
    with a lead over the next three teams of only 0.2\%. A simpler version of the
    system ranked second in the German Dialect Identification (GDI) task thanks to
    several ad hoc postprocessing steps. Complementary analyses carried out by a
    cross-validation procedure suggest that the BM25 weighting scheme could be
    competitive in this type of tasks, at least in comparison with the sublinear
    TF-IDF. POStag n-grams also improved the system performance.},
  doi       = {10.18653/v1/W17-1214},
  url       = {https://aclanthology.org/W17-1214}
}

