% Choshen and Abend (2018), ACL 2018 long paper on automatic validation of
% evaluation metrics for Grammatical Error Correction (GEC).
% The abstract uses only plain text and standard LaTeX, so it can be typeset
% by styles that print abstracts without any paper-specific macro definitions.
@InProceedings{choshen-abend:2018:Long2,
  author    = {Choshen, Leshem and Abend, Omri},
  title     = {Automatic Metric Validation for Grammatical Error Correction},
  booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2018},
  address   = {Melbourne, Australia},
  publisher = {Association for Computational Linguistics},
  pages     = {1372--1382},
  abstract  = {Metric validation in Grammatical Error Correction (GEC) is currently done by observing the correlation between human and metric-induced rankings. However, such correlation studies are costly, methodologically troublesome, and suffer from low inter-rater agreement. We propose {MAEGE}, an automatic methodology for GEC metric validation, that overcomes many of the difficulties in the existing methodology. Experiments with {MAEGE} shed a new light on metric quality, showing for example that the standard $M^2$ metric fares poorly on corpus-level ranking. Moreover, we use {MAEGE} to perform a detailed analysis of metric behavior, showing that some types of valid edits are consistently penalized by existing metrics.},
  url       = {http://www.aclweb.org/anthology/P18-1127}
}