@InProceedings{phan-EtAl:2016:COLING,
  author    = {Phan, Sang  and  Miyao, Yusuke  and  Le, Duy-Dinh  and  Satoh, Shin'ichi},
  title     = {Video Event Detection by Exploiting Word Dependencies from Image Captions},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {3318--3327},
  abstract  = {Video event detection is a challenging problem in information and multimedia
	retrieval. Different from single action detection, event detection requires a
	richer level of semantic information from video. In order to overcome this
	challenge, existing solutions often represent videos using high level features
	such as concepts. However, concept-based representation can be confusing
	because it does not encode the relationship between concepts. This issue can be
	addressed by exploiting the co-occurrences of the concepts, however, it often
	leads to a very huge number of possible combinations. In this paper, we propose
	a new approach to obtain the relationship between concepts by exploiting the
	syntactic dependencies between words in the image captions. The main advantage
	of this approach is that it significantly reduces the number of informative
	combinations between concepts. We conduct extensive experiments to analyze the
	effectiveness of using the new dependency representation for event detection on
	two large-scale TRECVID Multimedia Event Detection 2013 and 2014 datasets.
	Experimental results show that i) Dependency features are more discriminative
	than concept-based features. ii) Dependency features can be combined with our
	current event detection system to further improve the performance. For
	instance, the relative improvement can be as far as 8.6\% on the MEDTEST14 10Ex
	setting.},
  url       = {http://aclweb.org/anthology/C16-1313},
}

