@inproceedings{elliott-kadar:2017:I17-1,
  author    = {Elliott, Desmond and K{\'a}d{\'a}r, {\'A}kos},
  title     = {Imagination Improves Multimodal Translation},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  month     = nov,
  year      = {2017},
  address   = {Taipei, Taiwan},
  publisher = {Asian Federation of Natural Language Processing},
  pages     = {130--141},
  abstract  = {We decompose multimodal translation into two sub-tasks: learning to translate
	and learning visually grounded representations. In a multitask learning
	framework, translations are learned in an attention-based encoder-decoder, and
	grounded representations are learned through image representation prediction.
	Our approach improves translation performance compared to the state of the art
	on the Multi30K dataset. Furthermore, it is equally effective if we train the
	image prediction task on the external MS COCO dataset, and we find improvements
	if we train the translation model on the external News Commentary parallel
	text.},
  url       = {http://www.aclweb.org/anthology/I17-1014},
}

