% Workshop paper (BioTxtM 2016, held with COLING 2016); ACL Anthology ID W16-5109 per the url field.
% `month` uses the predefined BibTeX macro (unquoted) so styles can abbreviate/localize it.
@InProceedings{zweigenbaum-grouin-lavergne:2016:BioTxtM2016,
  author    = {Zweigenbaum, Pierre and Grouin, Cyril and Lavergne, Thomas},
  title     = {Supervised classification of end-of-lines in clinical text with no manual annotation},
  booktitle = {Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining ({BioTxtM2016})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {80--88},
  abstract  = {In some plain text documents, end-of-line marks may or may not mark the
    boundary of a text unit (e.g., of a paragraph).  This vexing problem is likely
    to impact subsequent natural language processing components, but is seldom
    addressed in the literature.  We propose a method which uses no manual
    annotation to classify whether end-of-lines must actually be seen as simple
    spaces (soft line breaks) or as true text unit boundaries. This method, which
    includes self-training and co-training steps based on token and line length
    features, achieves 0.943 F-measure on a corpus of short e-books with controlled
    format, F=0.904 on a random sample of 24 clinical texts with soft line breaks,
    and F=0.898 on a larger set of mixed clinical texts which may or may not
    contain soft line breaks, a fairly high value for a method with no manual
    annotation.},
  url       = {http://aclweb.org/anthology/W16-5109}
}

