@inproceedings{mostafazadeh-EtAl:2017:I17-1,
  author    = {Mostafazadeh, Nasrin and Brockett, Chris and Dolan, Bill and Galley, Michel and Gao, Jianfeng and Spithourakis, Georgios and Vanderwende, Lucy},
  title     = {Image-Grounded Conversations: Multimodal Context for Natural Question and Response Generation},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  month     = nov,
  year      = {2017},
  address   = {Taipei, Taiwan},
  publisher = {Asian Federation of Natural Language Processing},
  pages     = {462--472},
  abstract  = {The popularity of image sharing on social media and the engagement it creates
    between users reflect the important role that visual context plays in
    everyday conversations. We present a novel task, Image Grounded Conversations
    (IGC), in which natural-sounding conversations are generated about a shared
    image. To benchmark progress, we introduce a new multiple reference dataset of
    crowd-sourced, event-centric conversations on images. IGC falls on the
    continuum between chit-chat and goal-directed conversation models, where visual
    grounding constrains the topic of conversation to event-driven utterances.
    Experiments with models trained on social media data show that the combination
    of visual and textual context enhances the quality of generated conversational
    turns. In human evaluation, the gap between human performance and that of both
    neural and retrieval architectures suggests that multi-modal IGC presents an
    interesting challenge for dialog research.},
  url       = {http://www.aclweb.org/anthology/I17-1047},
}

