@InProceedings{pasunuru-bansal:2017:Long,
  author    = {Pasunuru, Ramakanth  and  Bansal, Mohit},
  title     = {Multi-Task Video Captioning with Video and Entailment Generation},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1273--1283},
  abstract  = {Video captioning, the task of describing the content of a video, has seen
               promising improvements in recent years with sequence-to-sequence models, but
               accurately learning the temporal and logical dynamics involved in the task
               remains a challenge, especially given the lack of sufficient annotated
               data. We improve video captioning by sharing knowledge with two related
               directed-generation tasks: a temporally-directed unsupervised video prediction
               task to learn richer context-aware video encoder representations, and a
               logically-directed language entailment generation task to learn better
               video-entailing caption decoder representations. For this, we present a
               many-to-many multi-task learning model that shares parameters across the
               encoders and decoders of the three tasks. We achieve significant improvements
               and the new state-of-the-art on several standard video captioning datasets
               using diverse automatic and human evaluations. We also show mutual multi-task
               improvements on the entailment generation task.},
  url       = {http://aclweb.org/anthology/P17-1117}
}
