@inproceedings{reimers-beyer-gurevych:2016:COLING,
  author    = {Reimers, Nils and Beyer, Philip and Gurevych, Iryna},
  title     = {Task-Oriented Intrinsic Evaluation of Semantic Textual Similarity},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {87--96},
  abstract  = {Semantic Textual Similarity (STS) is a foundational NLP task and can be used in
               a wide range of tasks. To determine the STS of two texts, hundreds of different
               STS systems exist, however, for an NLP system designer, it is hard to decide
               which system is the best one. To answer this question, an intrinsic evaluation
               of the STS systems is conducted by comparing the output of the system to human
               judgments on semantic similarity. The comparison is usually done using Pearson
               correlation. In this work, we show that relying on intrinsic evaluations with
               Pearson correlation can be misleading. In three common STS based tasks we could
               observe that the Pearson correlation was especially ill-suited to detect the
               best STS system for the task and other evaluation measures were much better
               suited. In this work we define how the validity of an intrinsic evaluation can
               be assessed and compare different intrinsic evaluation methods. Understanding
               of the properties of the targeted task is crucial and we propose a framework
               for conducting the intrinsic evaluation which takes the properties of the
               targeted task into account.},
  url       = {https://aclweb.org/anthology/C16-1009},
}

