@InProceedings{jurgens-tsvetkov-jurafsky:2017:Short,
  author    = {Jurgens, David and Tsvetkov, Yulia and Jurafsky, Dan},
  title     = {Incorporating Dialectal Variability for Socially Equitable Language Identification},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {51--57},
  abstract  = {Language identification (LID) is a critical first step for processing
               multilingual text. Yet most LID systems are not designed to handle the
               linguistic diversity of global platforms like Twitter, where local dialects and
               rampant code-switching lead language classifiers to systematically miss
               minority dialect speakers and multilingual speakers. We propose a new dataset
               and a character-based sequence-to-sequence model for LID designed to support
               dialectal and multilingual language varieties. Our model achieves
               state-of-the-art performance on multiple LID benchmarks. Furthermore, in a
               case study using Twitter for health tracking, our method substantially
               increases the availability of texts written by underrepresented populations,
               enabling the development of ``socially inclusive'' NLP tools.},
  doi       = {10.18653/v1/P17-2009},
  url       = {http://aclweb.org/anthology/P17-2009}
}

