% Williams and Dagli (2017), VarDial workshop paper; ACL Anthology ID W17-1209.
% month uses the predefined macro so styles can abbreviate/localise it; braces in
% title/booktitle protect proper nouns and acronyms from style recasing.
@inproceedings{williams-dagli:2017:VarDial,
  author    = {Williams, Jennifer and Dagli, Charlie},
  title     = {{Twitter} Language Identification Of Similar Languages And Dialects Without Ground Truth},
  booktitle = {Proceedings of the Fourth Workshop on {NLP} for Similar Languages, Varieties and Dialects ({VarDial})},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {73--83},
  abstract  = {We present a new method to bootstrap filter Twitter language ID labels in our
    dataset for automatic language identification (LID). Our method combines
    geo-location, original Twitter LID labels, and Amazon Mechanical Turk to
    resolve missing and unreliable labels. We are the first to compare LID
    classification performance using the MIRA algorithm and langid.py. We show
    classifier performance on different versions of our dataset with high accuracy
    using only Twitter data, without ground truth, and very few training examples.
    We also show how Platt Scaling can be used to calibrate MIRA classifier output
    values into a probability distribution over candidate classes, making the
    output more intuitive. Our method allows for fine-grained distinctions between
    similar languages and dialects and allows us to rediscover the language
    composition of our Twitter dataset.},
  url       = {http://www.aclweb.org/anthology/W17-1209},
}

