% Tuggener (2017), EACL 2017 long paper: comparative evaluation of system outputs.
% The acronym in the title is brace-protected so sentence-casing styles keep "POS";
% the month uses the predefined macro so the style controls how it is rendered.
@InProceedings{tuggener:2017:EACLlong,
  author    = {Tuggener, Don},
  title     = {A method for in-depth comparative evaluation: How (dis)similar are outputs of {POS} taggers, dependency parsers and coreference resolvers really?},
  booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {188--198},
  abstract  = {This paper proposes a generic method for the comparative evaluation of system
    outputs. The approach is able to quantify the pairwise differences between two
    outputs and to unravel in detail what the differences consist of. We
    apply our approach to three tasks in Computational Linguistics, i.e. POS
    tagging, dependency parsing, and coreference resolution. We find that system
    outputs are more distinct than the (often) small differences in evaluation
    scores seem to suggest.},
  url       = {http://www.aclweb.org/anthology/E17-1018}
}