@inproceedings{anderson-EtAl:2017:EMNLP2017,
  author    = {Anderson, Peter and Fernando, Basura and Johnson, Mark and Gould, Stephen},
  title     = {Guided Open Vocabulary Image Captioning with Constrained Beam Search},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {936--945},
  abstract  = {Existing image captioning models do not generalize well to out-of-domain images
    containing novel scenes or objects. This limitation severely hinders the use of
    these models in real world applications dealing with images in the wild. We
    address this problem using a flexible approach that enables existing deep
    captioning architectures to take advantage of image taggers at test time,
    without re-training. Our method uses constrained beam search to force the
    inclusion of selected tag words in the output, and fixed, pretrained word
    embeddings to facilitate vocabulary expansion to previously unseen tag words.
    Using this approach we achieve state of the art results for out-of-domain
    captioning on MSCOCO (and improved results for in-domain captioning). Perhaps
    surprisingly, our results significantly outperform approaches that incorporate
    the same tag predictions into the learning algorithm. We also show that we can
    significantly improve the quality of generated ImageNet captions by leveraging
    ground-truth labels.},
  url       = {https://www.aclweb.org/anthology/D17-1098},
}

