% Sajjad et al., ACL 2017 short paper (URL below points to anthology ID P17-2095).
% Notes: month uses the predefined BibTeX macro so styles can format/localise it;
% {Arabic} is brace-protected so sentence-casing styles keep its capital.
@InProceedings{sajjad-EtAl:2017:Short,
  author    = {Sajjad, Hassan  and  Dalvi, Fahim  and  Durrani, Nadir  and  Abdelali, Ahmed  and  Belinkov, Yonatan  and  Vogel, Stephan},
  title     = {Challenging Language-Dependent Segmentation for {Arabic}: An Application to Machine Translation and Part-of-Speech Tagging},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {601--607},
  abstract  = {Word segmentation plays a pivotal role in improving any Arabic NLP application.
	Therefore, a lot of research has been spent in improving its accuracy.
	Off-the-shelf tools, however, are: i) complicated to use and ii) domain/dialect
	dependent. We explore three language-independent alternatives to morphological
	segmentation using: i) data-driven sub-word units, ii) characters as a unit
	of learning, and iii) word embeddings learned using a character CNN
	(Convolution Neural Network). On the tasks of Machine Translation and POS
	tagging, we found these methods to achieve close to, and occasionally surpass
	state-of-the-art performance. In our analysis, we show that a neural machine
	translation system is sensitive to the ratio of source and target tokens, and a
	ratio close to 1 or greater, gives optimal performance.},
  url       = {http://aclweb.org/anthology/P17-2095}
}

