@inproceedings{yin-ordonez:2017:EMNLP2017,
  author    = {Yin, Xuwang and Ordonez, Vicente},
  title     = {{Obj2Text}: Generating Visually Descriptive Language from Object Layouts},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {177--187},
  abstract  = {Generating captions for images is a task that has recently received
	considerable attention. Another type of visual inputs are abstract scenes or
	object layouts where the only information provided is a set of objects and
	their locations. This type of imagery is commonly found in many applications in
	computer graphics, virtual reality, and storyboarding. We explore in this paper
	OBJ2TEXT, a sequence-to-sequence model that encodes a set of objects and their
	locations as an input sequence using an LSTM network, and decodes this
	representation using an LSTM language model. We show in our paper that this
	model despite using a sequence encoder can effectively represent complex
	spatial object-object relationships and produce descriptions that are globally
	coherent and semantically relevant. We test our approach for the task of
	describing object layouts in the MS-COCO dataset by producing sentences given
	only object annotations. We additionally show that our model combined with a
	state-of-the-art object detector can improve the accuracy of an image
	captioning model.},
  doi       = {10.18653/v1/D17-1017},
  url       = {https://www.aclweb.org/anthology/D17-1017},
}

