@InProceedings{shekhar-EtAl:2017:Long,
  author    = {Shekhar, Ravi  and  Pezzelle, Sandro  and  Klimovich, Yauhen  and  Herbelot, Aur{\'e}lie  and  Nabi, Moin  and  Sangineto, Enver  and  Bernardi, Raffaella},
  title     = {{FOIL} it! Find One mismatch between Image and Language caption},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {255--265},
  abstract  = {In this paper, we aim to understand whether current language and vision (LaVi)
	models truly grasp the interaction between the two modalities. To this end, we
	propose an extension of the MS-COCO dataset, FOIL-COCO, which associates images
	with both correct and `foil' captions, that is, descriptions of the image that
	are highly similar to the original ones, but contain one single mistake (`foil
	word'). We show that current LaVi models fall into the traps of this data and
	perform badly on three tasks: a) caption  classification (correct vs. foil); b)
	foil word detection; c) foil word correction. Humans, in contrast, have
	near-perfect performance on those tasks. We demonstrate that merely utilising
	language cues is not enough to model FOIL-COCO and that it challenges the
	state-of-the-art by requiring a fine-grained understanding of the relation
	between text and image.},
  url       = {http://aclweb.org/anthology/P17-1024}
}

