@InProceedings{birmingham-muscat:2017:VL17,
  author    = {Birmingham, Brandon  and  Muscat, Adrian},
  title     = {The Use of Object Labels and Spatial Prepositions as Keywords in a Web-Retrieval-Based Image Caption Generation System},
  booktitle = {Proceedings of the Sixth Workshop on Vision and Language},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {11--20},
  abstract  = {In this paper, a retrieval-based caption generation system that searches
	the web for suitable image descriptions is studied. Google's reverse image
	search is used to find potentially relevant web multimedia content for query
	images. Sentences are extracted from web pages and the likelihood of the
	descriptions is computed to select one sentence from the retrieved text
	documents. The search mechanism is modified to replace the caption generated
	by Google with a caption composed of labels and spatial prepositions as part
	of the query's text alongside the image. The object labels are obtained
	using an off-the-shelf R-CNN and a machine learning model is developed to
	predict the prepositions. The effect on the caption generation system
	performance when using the generated text is investigated. Both human
	evaluations and automatic metrics are used to evaluate the retrieved
	descriptions. Results show that the web-retrieval-based approach performed
	better when describing single-object images with sentences extracted from
	stock photography websites. On the other hand, images with two image objects
	were better described with template-generated sentences composed of object
	labels and prepositions.},
  url       = {http://www.aclweb.org/anthology/W17-2002},
}

