% Zhuang et al., EMNLP 2017 conference paper (ACL Anthology D17-1291).
% Percent signs in the abstract are escaped so that styles which print the
% abstract do not start a LaTeX comment mid-sentence.
@InProceedings{zhuang-EtAl:2017:EMNLP2017,
  author    = {Zhuang, Honglei and Wang, Chi and Tao, Fangbo and Kaplan, Lance and Han, Jiawei},
  title     = {Identifying Semantically Deviating Outlier Documents},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2748--2757},
  abstract  = {A document outlier is a document that substantially deviates in
    semantics from the majority ones in a corpus. Automatic identification of
    document outliers can be valuable in many applications, such as screening
    health records for medical mistakes. In this paper, we study the problem
    of mining semantically deviating document outliers in a given corpus. We
    develop a generative model to identify frequent and characteristic
    semantic regions in the word embedding space to represent the given
    corpus, and a robust outlierness measure which is resistant to noisy
    content in documents. Experiments conducted on two real-world textual data
    sets show that our method can achieve an up to 135\% improvement over
    baselines in terms of recall at top-1\% of the outlier ranking.},
  url       = {https://www.aclweb.org/anthology/D17-1291},
}

