% Jia & Liang, EMNLP 2017 conference paper on adversarial evaluation of
% SQuAD reading-comprehension models. The month is given as the standard
% BibTeX macro (sep) so that styles can localise or abbreviate it; the DOI
% is stored bare (no resolver prefix) alongside the original URL.
@InProceedings{jia-liang:2017:EMNLP2017,
  author    = {Jia, Robin and Liang, Percy},
  title     = {Adversarial Examples for Evaluating Reading Comprehension Systems},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2021--2031},
  doi       = {10.18653/v1/D17-1215},
  url       = {https://www.aclweb.org/anthology/D17-1215},
  abstract  = {Standard accuracy metrics indicate that reading comprehension
               systems are making rapid progress, but the extent to which
               these systems truly understand language remains unclear.
               To reward systems with real language understanding abilities,
               we propose an adversarial evaluation scheme for the Stanford
               Question Answering Dataset (SQuAD).
               Our method tests whether systems can answer questions about
               paragraphs that contain adversarially inserted sentences,
               which are automatically generated to distract computer systems
               without changing the correct answer or misleading humans.
               In this adversarial setting, the accuracy of sixteen published
               models drops from an average of $75\%$ F1 score to $36\%$;
               when the adversary is allowed to add ungrammatical sequences
               of words, average accuracy on four models decreases further
               to $7\%$.
               We hope our insights will motivate the development of new
               models that understand language more precisely.},
}