@inproceedings{madhyastha-etal-2018-end,
    title = "End-to-end Image Captioning Exploits Distributional Similarity in Multimodal Space",
    author = "Madhyastha, Pranava Swaroop and
      Wang, Josiah and
      Specia, Lucia",
    editor = "Linzen, Tal and
      Chrupa{\l}a, Grzegorz and
      Alishahi, Afra",
    booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}",
    month = nov,
    year = "2018",
    address = "Brussels, Belgium",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/W18-5455",
    doi = "10.18653/v1/W18-5455",
    pages = "381--383",
    abstract = "We hypothesize that end-to-end neural image captioning systems work seemingly well because they exploit and learn {`}distributional similarity{'} in a multimodal feature space, by mapping a test image to similar training images in this space and generating a caption from the same space. To validate our hypothesis, we focus on the {`}image{'} side of image captioning, and vary the input image representation but keep the RNN text generation model of a CNN-RNN constant. Our analysis indicates that image captioning models (i) are capable of separating structure from noisy input representations; (ii) experience virtually no significant performance loss when a high dimensional representation is compressed to a lower dimensional space; (iii) cluster images with similar visual and linguistic information together. Our experiments all point to one fact: that our distributional similarity hypothesis holds. We conclude that, regardless of the image representation, image captioning systems seem to match images and generate captions in a learned joint image-text semantic subspace.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="madhyastha-etal-2018-end">
    <titleInfo>
      <title>End-to-end Image Captioning Exploits Distributional Similarity in Multimodal Space</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Pranava</namePart>
      <namePart type="given">Swaroop</namePart>
      <namePart type="family">Madhyastha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Josiah</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lucia</namePart>
      <namePart type="family">Specia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2018-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Tal</namePart>
        <namePart type="family">Linzen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Grzegorz</namePart>
        <namePart type="family">Chrupała</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Afra</namePart>
        <namePart type="family">Alishahi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Brussels, Belgium</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We hypothesize that end-to-end neural image captioning systems work seemingly well because they exploit and learn ‘distributional similarity’ in a multimodal feature space, by mapping a test image to similar training images in this space and generating a caption from the same space. To validate our hypothesis, we focus on the ‘image’ side of image captioning, and vary the input image representation but keep the RNN text generation model of a CNN-RNN constant. Our analysis indicates that image captioning models (i) are capable of separating structure from noisy input representations; (ii) experience virtually no significant performance loss when a high dimensional representation is compressed to a lower dimensional space; (iii) cluster images with similar visual and linguistic information together. Our experiments all point to one fact: that our distributional similarity hypothesis holds. We conclude that, regardless of the image representation, image captioning systems seem to match images and generate captions in a learned joint image-text semantic subspace.</abstract>
    <identifier type="citekey">madhyastha-etal-2018-end</identifier>
    <identifier type="doi">10.18653/v1/W18-5455</identifier>
    <location>
      <url>https://aclanthology.org/W18-5455</url>
    </location>
    <part>
      <date>2018-11</date>
      <extent unit="page">
        <start>381</start>
        <end>383</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T End-to-end Image Captioning Exploits Distributional Similarity in Multimodal Space
%A Madhyastha, Pranava Swaroop
%A Wang, Josiah
%A Specia, Lucia
%Y Linzen, Tal
%Y Chrupała, Grzegorz
%Y Alishahi, Afra
%S Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP
%D 2018
%8 November
%I Association for Computational Linguistics
%C Brussels, Belgium
%F madhyastha-etal-2018-end
%X We hypothesize that end-to-end neural image captioning systems work seemingly well because they exploit and learn ‘distributional similarity’ in a multimodal feature space, by mapping a test image to similar training images in this space and generating a caption from the same space. To validate our hypothesis, we focus on the ‘image’ side of image captioning, and vary the input image representation but keep the RNN text generation model of a CNN-RNN constant. Our analysis indicates that image captioning models (i) are capable of separating structure from noisy input representations; (ii) experience virtually no significant performance loss when a high dimensional representation is compressed to a lower dimensional space; (iii) cluster images with similar visual and linguistic information together. Our experiments all point to one fact: that our distributional similarity hypothesis holds. We conclude that, regardless of the image representation, image captioning systems seem to match images and generate captions in a learned joint image-text semantic subspace.
%R 10.18653/v1/W18-5455
%U https://aclanthology.org/W18-5455
%U https://doi.org/10.18653/v1/W18-5455
%P 381-383

Markdown (Informal)
[End-to-end Image Captioning Exploits Distributional Similarity in Multimodal Space](https://aclanthology.org/W18-5455) (Madhyastha et al., EMNLP 2018)

ACL
Pranava Swaroop Madhyastha, Josiah Wang, and Lucia Specia. 2018. End-to-end Image Captioning Exploits Distributional Similarity in Multimodal Space. In *Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP*, pages 381–383, Brussels, Belgium. Association for Computational Linguistics.