% McNamee (2016): VarDial3 workshop paper on language and dialect identification
% with PPM compression-inspired language models (DSL 2016 shared task).
@InProceedings{mcnamee:2016:VarDial3,
  author    = {McNamee, Paul},
  title     = {Language and Dialect Discrimination Using Compression-Inspired Language Models},
  booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial3)},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {195--203},
  abstract  = {The DSL 2016 shared task continued previous evaluations from 2014 and 2015 that
	facilitated the study of automated language and dialect identification. This
	paper describes results for this year's shared task and from several related
	experiments conducted at the Johns Hopkins University Human Language Technology
	Center of Excellence (JHU HLTCOE). Previously the HLTCOE has explored the use
	of compression-inspired language modeling for language and dialect
	identification, using news, Wikipedia, blog post, and Twitter corpora. The
	technique we have relied upon is based on prediction by partial matching (PPM),
	a state of the art text compression technique. Due to the close relationship
	between adaptive compression and language modeling, such compression techniques
	can also be applied to multi-way text classification problems, and previous
	studies have examined tasks such as authorship attribution, email spam
	detection, and topical classification. We applied our approach to the
	multi-class decision that considered each dialect or language as a possibility
	for the given shared task input line. Results for test-set A were in accord
	with our expectations, however results for test-sets B and C appear to be
	markedly worse. We had not anticipated the inclusion of multiple communications
	in differing languages in test-set B (social media) input lines, and had not
	expected the test-set C (dialectal Arabic) data to be represented phonetically
	instead of in native orthography.},
  url       = {http://aclweb.org/anthology/W16-4825}
}

