@inproceedings{nurani-venkitasubramanian-etal-2017-learning,
  title     = {Learning to Recognize Animals by Watching Documentaries: Using Subtitles as Weak Supervision},
  author    = {Nurani Venkitasubramanian, Aparna and
               Tuytelaars, Tinne and
               Moens, Marie-Francine},
  editor    = {Belz, Anya and
               Erdem, Erkut and
               Pastra, Katerina and
               Mikolajczyk, Krystian},
  booktitle = {Proceedings of the Sixth Workshop on Vision and Language},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/W17-2003},
  doi       = {10.18653/v1/W17-2003},
  pages     = {21--30},
  abstract  = {We investigate animal recognition models learned from wildlife video documentaries by using the weak supervision of the textual subtitles. This is a particularly challenging setting, since i) the animals occur in their natural habitat and are often largely occluded and ii) subtitles are to a large degree complementary to the visual content, providing a very weak supervisory signal. This is in contrast to most work on integrated vision and language in the literature, where textual descriptions are tightly linked to the image content, and often generated in a curated fashion for the task at hand. In particular, we investigate different image representations and models, including a support vector machine on top of activations of a pretrained convolutional neural network, as well as a Naive Bayes framework on a {`}bag-of-activations{'} image representation, where each element of the bag is considered separately. This representation allows key components in the image to be isolated, in spite of largely varying backgrounds and image clutter, without an object detection or image segmentation step. The methods are evaluated based on how well they transfer to unseen camera-trap images captured across diverse topographical regions under different environmental conditions and illumination settings, involving a large domain shift.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nurani-venkitasubramanian-etal-2017-learning">
<titleInfo>
<title>Learning to Recognize Animals by Watching Documentaries: Using Subtitles as Weak Supervision</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aparna</namePart>
<namePart type="family">Nurani Venkitasubramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tinne</namePart>
<namePart type="family">Tuytelaars</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on Vision and Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anya</namePart>
<namePart type="family">Belz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erkut</namePart>
<namePart type="family">Erdem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katerina</namePart>
<namePart type="family">Pastra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Krystian</namePart>
<namePart type="family">Mikolajczyk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Valencia, Spain</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We investigate animal recognition models learned from wildlife video documentaries by using the weak supervision of the textual subtitles. This is a particularly challenging setting, since i) the animals occur in their natural habitat and are often largely occluded and ii) subtitles are to a large degree complementary to the visual content, providing a very weak supervisory signal. This is in contrast to most work on integrated vision and language in the literature, where textual descriptions are tightly linked to the image content, and often generated in a curated fashion for the task at hand. In particular, we investigate different image representations and models, including a support vector machine on top of activations of a pretrained convolutional neural network, as well as a Naive Bayes framework on a ‘bag-of-activations’ image representation, where each element of the bag is considered separately. This representation allows key components in the image to be isolated, in spite of largely varying backgrounds and image clutter, without an object detection or image segmentation step. The methods are evaluated based on how well they transfer to unseen camera-trap images captured across diverse topographical regions under different environmental conditions and illumination settings, involving a large domain shift.</abstract>
<identifier type="citekey">nurani-venkitasubramanian-etal-2017-learning</identifier>
<identifier type="doi">10.18653/v1/W17-2003</identifier>
<location>
<url>https://aclanthology.org/W17-2003</url>
</location>
<part>
<date>2017-04</date>
<extent unit="page">
<start>21</start>
<end>30</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning to Recognize Animals by Watching Documentaries: Using Subtitles as Weak Supervision
%A Nurani Venkitasubramanian, Aparna
%A Tuytelaars, Tinne
%A Moens, Marie-Francine
%Y Belz, Anya
%Y Erdem, Erkut
%Y Pastra, Katerina
%Y Mikolajczyk, Krystian
%S Proceedings of the Sixth Workshop on Vision and Language
%D 2017
%8 April
%I Association for Computational Linguistics
%C Valencia, Spain
%F nurani-venkitasubramanian-etal-2017-learning
%X We investigate animal recognition models learned from wildlife video documentaries by using the weak supervision of the textual subtitles. This is a particularly challenging setting, since i) the animals occur in their natural habitat and are often largely occluded and ii) subtitles are to a large degree complementary to the visual content, providing a very weak supervisory signal. This is in contrast to most work on integrated vision and language in the literature, where textual descriptions are tightly linked to the image content, and often generated in a curated fashion for the task at hand. In particular, we investigate different image representations and models, including a support vector machine on top of activations of a pretrained convolutional neural network, as well as a Naive Bayes framework on a ‘bag-of-activations’ image representation, where each element of the bag is considered separately. This representation allows key components in the image to be isolated, in spite of largely varying backgrounds and image clutter, without an object detection or image segmentation step. The methods are evaluated based on how well they transfer to unseen camera-trap images captured across diverse topographical regions under different environmental conditions and illumination settings, involving a large domain shift.
%R 10.18653/v1/W17-2003
%U https://aclanthology.org/W17-2003
%U https://doi.org/10.18653/v1/W17-2003
%P 21-30
Markdown (Informal)
[Learning to Recognize Animals by Watching Documentaries: Using Subtitles as Weak Supervision](https://aclanthology.org/W17-2003) (Nurani Venkitasubramanian et al., VL 2017)
ACL