@inproceedings{nuranivenkitasubramanian-tuytelaars-moens:2017:VL17,
  author    = {Nurani Venkitasubramanian, Aparna and Tuytelaars, Tinne and Moens, Marie-Francine},
  title     = {Learning to Recognize Animals by Watching Documentaries: Using Subtitles as Weak Supervision},
  booktitle = {Proceedings of the Sixth Workshop on Vision and Language},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {21--30},
  abstract  = {We investigate animal recognition models learned from wildlife video
	documentaries by using the weak supervision of the textual subtitles. This is a
	particularly challenging setting, since i) the animals occur in their natural
	habitat and are often largely occluded and ii) subtitles are to a large degree
	complementary to the visual content, providing a very weak supervisory signal.
	This is in contrast to most work on integrated vision and language in the
	literature, where textual descriptions are tightly linked to the image content,
	and often generated in a curated fashion for the task at hand. In particular,
	we investigate different image representations and models, including a support
	vector machine on top of activations of a pretrained convolutional neural
	network, as well as a Naive Bayes framework on a 'bag-of-activations' image
	representation, where each element of the bag is considered separately. This
	representation allows key components in the image to be isolated, in spite of
	largely varying backgrounds and image clutter, without an object detection or
	image segmentation step. The methods are evaluated based on how well they
	transfer to unseen camera-trap images captured across diverse topographical
	regions under different environmental conditions and illumination settings,
	involving a large domain shift.},
  url       = {http://www.aclweb.org/anthology/W17-2003},
}

