% COLING 2016 technical paper: co-training-based training data enrichment for infrequent discourse relations.
@InProceedings{jiang-carenini-ng:2016:COLING,
  author    = {Jiang, Kailang and Carenini, Giuseppe and Ng, Raymond},
  title     = {Training Data Enrichment for Infrequent Discourse Relations},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {2603--2614},
  abstract  = {Discourse parsing is a popular technique widely used in text understanding,
    sentiment analysis and other NLP tasks. However, for most discourse parsers,
    the performance varies significantly across different discourse relations. In
    this paper, we first validate the underfitting hypothesis, i.e., the less
    frequent a relation is in the training data, the poorer the performance on that
    relation. We then explore how to increase the number of positive training
    instances, without resorting to manually creating additional labeled data. We
    propose a training data enrichment framework that relies on co-training of two
    different discourse parsers on unlabeled documents. Importantly, we show that
    co-training alone is not sufficient. The framework requires a filtering step to
    ensure that only ``good quality'' unlabeled documents can be used for
    enrichment and re-training. We propose and evaluate two ways to perform the
    filtering. The first is to use an agreement score between the two parsers. The
    second is to use only the confidence score of the faster parser. Our empirical
    results show that agreement score can help to boost the performance on
    infrequent relations, and that the confidence score is a viable approximation
    of the agreement score for infrequent relations.},
  url       = {http://aclweb.org/anthology/C16-1245},
}

