@InProceedings{madhyastha-wang-specia:2018:BlackboxNLP,
  author    = {Madhyastha, Pranava Swaroop  and  Wang, Josiah  and  Specia, Lucia},
  title     = {End-to-end Image Captioning Exploits Distributional Similarity in Multimodal Space},
  booktitle = {Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP},
  month     = {November},
  year      = {2018},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  pages     = {381--383},
  abstract  = {We hypothesize that end-to-end neural image captioning systems work seemingly well because they exploit and learn `distributional similarity' in a multimodal feature space, by mapping a test image to similar training images in this space and generating a caption from the same space. To validate our hypothesis, we focus on the `image' side of image captioning, and vary the input image representation but keep the RNN text generation model of a CNN-RNN constant. Our analysis indicates that image captioning models (i) are capable of separating structure from noisy input representations; (ii) experience virtually no significant performance loss when a high dimensional representation is compressed to a lower dimensional space; (iii) cluster images with similar visual and linguistic information together. Our experiments all point to one fact: that our distributional similarity hypothesis holds. We conclude that, regardless of the image representation, image captioning systems seem to match images and generate captions in a learned joint image-text semantic subspace.},
  url       = {http://www.aclweb.org/anthology/W18-5455}
}