% Workshop paper introducing a dataset for Sanskrit word segmentation.
% Case-sensitive words are brace-protected so sentence-casing styles keep
% them capitalised; month uses the predefined macro so styles can localise it.
@InProceedings{krishna-satuluri-goyal:2017:LaTeCH-CLfL,
  author    = {Krishna, Amrith and Satuluri, Pavan Kumar and Goyal, Pawan},
  title     = {A Dataset for {Sanskrit} Word Segmentation},
  booktitle = {Proceedings of the Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature},
  month     = aug,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {105--114},
  abstract  = {The last decade saw a surge in digitisation efforts for ancient manuscripts in
    Sanskrit. Due to various linguistic peculiarities inherent to the language,
    even the preliminary tasks such as word segmentation are non-trivial in
    Sanskrit. Elegant models for Word Segmentation in Sanskrit are indispensable
    for further syntactic and semantic processing of the manuscripts. Current works
    in word segmentation for Sanskrit, though commendable in their novelty, often
    have variations in their objective and evaluation criteria. In this work, we
    set the record straight. We formally define the objectives and the requirements
    for the word segmentation task. In order to encourage research in the field and
    to alleviate the time and effort required in pre-processing, we release a
    dataset of 115,000 sentences for word segmentation. For each sentence in the
    dataset we include the input character sequence, ground truth segmentation, and
    additionally lexical and morphological information about all the phonetically
    possible segments for the given sentence. In this work, we also discuss the
    linguistic considerations made while generating the candidate space of the
    possible segments.},
  url       = {http://www.aclweb.org/anthology/W17-2214},
}

