% Workshop paper from the 3rd Workshop on Noisy User-generated Text (ACL Anthology W17-4416).
@InProceedings{jang-choi-allan:2017:WNUT,
  author    = {Jang, Myungha  and  Choi, Jinho D.  and  Allan, James},
  title     = {Improving Document Clustering by Removing Unnatural Language},
  booktitle = {Proceedings of the 3rd Workshop on Noisy User-generated Text},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {122--130},
  abstract  = {Technical documents contain a fair amount of unnatural language, such as
    tables, formulas, and pseudo-code. Unnatural language can be an important factor
    of confusing existing NLP tools. This paper presents an effective method of
    distinguishing unnatural language from natural language, and evaluates the
    impact of unnatural language detection on NLP tasks such as document
    clustering. We view this problem as an information extraction task and build a
    multiclass classification model identifying unnatural language components into
    four categories. First, we create a new annotated corpus by collecting slides
    and papers in various formats, PPT, PDF, and HTML, where unnatural language
    components are annotated into four categories. We then explore features
    available from plain text to build a statistical model that can handle any
    format as long as it is converted into plain text. Our experiments show that
    removing unnatural language components gives an absolute improvement in
    document clustering by up to 15\%. Our corpus and tool are publicly available.},
  url       = {http://www.aclweb.org/anthology/W17-4416}
}

