% COLING 2016 paper (Postma, Izquierdo Bevia, Vossen) on balancing sense
% distributions for all-words WSD. Month uses the standard BibTeX macro; braces
% in title/booktitle protect capitals from style-driven sentence casing.
@InProceedings{postma-izquierdobevia-vossen:2016:COLING,
  author    = {Postma, Marten  and  Izquierdo Bevia, Ruben  and  Vossen, Piek},
  title     = {More is not always better: balancing sense distributions for all-words {Word Sense Disambiguation}},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {3496--3506},
  abstract  = {Current Word Sense Disambiguation systems show an extremely poor performance on
	low frequent senses, which is mainly caused by the difference in sense
	distributions between training and test data. The main focus in tackling this
	problem has been on acquiring more data or selecting a single predominant sense
	and not necessarily on the meta properties of the data itself. We demonstrate
	that these properties, such as the volume, provenance, and balancing, play an
	important role with respect to system performance. In this paper, we describe a
	set of experiments to analyze these meta properties in the framework of a
	state-of-the-art WSD system when evaluated on the SemEval-2013 English
	all-words dataset. We show that volume and provenance are indeed important, but
	that approximating the perfect balancing of the selected training data leads to
	an improvement of 21 points and exceeds state-of-the-art systems by 14 points
	while using only simple features. We therefore conclude that unsupervised
	acquisition of training data should be guided by strategies aimed at matching
	meta properties.},
  url       = {http://aclweb.org/anthology/C16-1330}
}

