% BioNLP 2017 paper by Ganesan, Tendulkar and Chakraborti; ACL Anthology ID W17-2330 (see url field).
@InProceedings{ganesan-tendulkar-chakraborti:2017:BioNLP17,
  author    = {Ganesan, Devi  and  Tendulkar, Ashish V.  and  Chakraborti, Sutanu},
  title     = {Protein Word Detection using Text Segmentation Techniques},
  booktitle = {BioNLP 2017},
  month     = aug,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {238--246},
  abstract  = {Literature in Molecular Biology is abundant with linguistic metaphors. There
    have been works in the past that attempt to draw parallels between linguistics
    and biology, driven by the fundamental premise that proteins have a language of
    their own. Since word detection is crucial to the decipherment of any unknown
    language, we attempt to establish a problem mapping from natural language text
    to protein sequences at the level of words. Towards this end, we explore the
    use of an unsupervised text segmentation algorithm to the task of extracting
    ``biological words'' from protein sequences. In particular, we demonstrate the
    effectiveness of using domain knowledge to complement data driven approaches in
    the text segmentation task, as well as in its biological counterpart. We also
    propose a novel extrinsic evaluation measure for protein words through protein
    family classification.},
  url       = {http://www.aclweb.org/anthology/W17-2330}
}