% INLG 2017 conference paper: template-based textual descriptions of human activities in video.
@InProceedings{alharbi-gotoh:2017:INLG2017,
  author    = {Alharbi, Nouf and Gotoh, Yoshihiko},
  title     = {Natural Language Descriptions for Human Activities in Video Streams},
  booktitle = {Proceedings of the 10th International Conference on Natural Language Generation},
  month     = sep,
  year      = {2017},
  address   = {Santiago de Compostela, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {85--94},
  abstract  = {There has been continuous growth in the volume and ubiquity of video material.
	It has become essential to define video semantics in order to aid the
	searchability and retrieval of this data. We present a framework that produces
	textual descriptions of video, based on the visual semantic content. Detected
	action classes rendered as verbs, participant objects converted to noun
	phrases, visual properties of detected objects rendered as adjectives and
	spatial relations between objects rendered as prepositions. Further, in cases
	of zero-shot action recognition, a language model is used to infer a missing
	verb, aided by the detection of objects and scene settings. These extracted
	features are converted into textual descriptions using a template-based
	approach. The proposed video descriptions framework evaluated on the NLDHA
	dataset using ROUGE scores and human judgment evaluation.},
  url       = {http://www.aclweb.org/anthology/W17-3512},
}

