@InProceedings{laokulrat-EtAl:2016:COLING,
  author    = {Laokulrat, Natsuda  and  Phan, Sang  and  Nishida, Noriki  and  Shu, Raphael  and  Ehara, Yo  and  Okazaki, Naoaki  and  Miyao, Yusuke  and  Nakayama, Hideki},
  title     = {Generating Video Description using Sequence-to-sequence Model with Temporal Attention},
  booktitle = {Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = {December},
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {44--52},
  abstract  = {Automatic video description generation has recently been receiving attention
	following the rapid advances in image caption generation. Automatically generating a
	description for a video is more challenging than for an image because of the
	temporal dynamics of its frames. Most prior work has relied on Recurrent Neural
	Networks (RNNs), and recently attentional mechanisms have also been applied to
	make the model learn to focus on particular frames of the video while generating
	each word of the describing sentence.
	In this paper, we focus on a sequence-to-sequence approach with a temporal
	attention mechanism. We analyze and compare the results of different
	attention model configurations. By applying the temporal attention mechanism to
	the system, we achieve a METEOR score of 0.310 on the Microsoft Video
	Description dataset, outperforming the previous state-of-the-art system.},
  url       = {http://aclweb.org/anthology/C16-1005}
}

