% Blodgett, Wei and O'Connor (2017): workshop paper in the proceedings of the
% 3rd Workshop on Noisy User-generated Text (ACL Anthology ID W17-4408).
% The citation key is kept unchanged so existing \cite commands keep working.
% The month uses the predefined BibTeX macro; {English} is brace-protected so
% that sentence-casing styles do not lowercase it.
@inproceedings{blodgett-wei-oconnor:2017:WNUT,
  author    = {Blodgett, Su Lin and Wei, Johnny and O'Connor, Brendan},
  title     = {A Dataset and Classifier for Recognizing Social Media {English}},
  booktitle = {Proceedings of the 3rd Workshop on Noisy User-generated Text},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {56--61},
  doi       = {10.18653/v1/W17-4408},
  url       = {http://www.aclweb.org/anthology/W17-4408},
  abstract  = {While language identification works well on standard texts, it performs much
    worse on social media language, in particular dialectal language---even for
    English. First, to support work on English language identification, we
    contribute a new dataset of tweets annotated for English versus non-English,
    with attention to ambiguity, code-switching, and automatic generation issues.
    It is randomly sampled from all public messages, avoiding biases towards
    pre-existing language classifiers. Second, we find that a demographic language
    model---which identifies messages with language similar to that used by several
    U.S. ethnic populations on Twitter---can be used to improve English language
    identification performance when combined with a traditional supervised language
    identifier. It increases recall with almost no loss of precision, including,
    surprisingly, for English messages written by non-U.S. authors.
    Our dataset and identifier ensemble are available online.},
}