@inproceedings{alharbi-gotoh-2017-natural,
title = "Natural Language Descriptions for Human Activities in Video Streams",
author = "Alharbi, Nouf and
Gotoh, Yoshihiko",
editor = "Alonso, Jose M. and
Bugar{\'i}n, Alberto and
Reiter, Ehud",
booktitle = "Proceedings of the 10th International Conference on Natural Language Generation",
month = sep,
year = "2017",
address = "Santiago de Compostela, Spain",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W17-3512/",
doi = "10.18653/v1/W17-3512",
pages = "85--94",
abstract = "There has been continuous growth in the volume and ubiquity of video material. It has become essential to define video semantics in order to aid the searchability and retrieval of this data. We present a framework that produces textual descriptions of video, based on the visual semantic content. Detected action classes are rendered as verbs, participant objects are converted to noun phrases, visual properties of detected objects are rendered as adjectives and spatial relations between objects are rendered as prepositions. Further, in cases of zero-shot action recognition, a language model is used to infer a missing verb, aided by the detection of objects and scene settings. These extracted features are converted into textual descriptions using a template-based approach. The proposed video descriptions framework is evaluated on the NLDHA dataset using ROUGE scores and human judgment evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alharbi-gotoh-2017-natural">
<titleInfo>
<title>Natural Language Descriptions for Human Activities in Video Streams</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nouf</namePart>
<namePart type="family">Alharbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoshihiko</namePart>
<namePart type="family">Gotoh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th International Conference on Natural Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jose</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Alonso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Bugarín</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Santiago de Compostela, Spain</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>There has been continuous growth in the volume and ubiquity of video material. It has become essential to define video semantics in order to aid the searchability and retrieval of this data. We present a framework that produces textual descriptions of video, based on the visual semantic content. Detected action classes are rendered as verbs, participant objects are converted to noun phrases, visual properties of detected objects are rendered as adjectives and spatial relations between objects are rendered as prepositions. Further, in cases of zero-shot action recognition, a language model is used to infer a missing verb, aided by the detection of objects and scene settings. These extracted features are converted into textual descriptions using a template-based approach. The proposed video descriptions framework is evaluated on the NLDHA dataset using ROUGE scores and human judgment evaluation.</abstract>
<identifier type="citekey">alharbi-gotoh-2017-natural</identifier>
<identifier type="doi">10.18653/v1/W17-3512</identifier>
<location>
<url>https://aclanthology.org/W17-3512/</url>
</location>
<part>
<date>2017-09</date>
<extent unit="page">
<start>85</start>
<end>94</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Natural Language Descriptions for Human Activities in Video Streams
%A Alharbi, Nouf
%A Gotoh, Yoshihiko
%Y Alonso, Jose M.
%Y Bugarín, Alberto
%Y Reiter, Ehud
%S Proceedings of the 10th International Conference on Natural Language Generation
%D 2017
%8 September
%I Association for Computational Linguistics
%C Santiago de Compostela, Spain
%F alharbi-gotoh-2017-natural
%X There has been continuous growth in the volume and ubiquity of video material. It has become essential to define video semantics in order to aid the searchability and retrieval of this data. We present a framework that produces textual descriptions of video, based on the visual semantic content. Detected action classes are rendered as verbs, participant objects are converted to noun phrases, visual properties of detected objects are rendered as adjectives and spatial relations between objects are rendered as prepositions. Further, in cases of zero-shot action recognition, a language model is used to infer a missing verb, aided by the detection of objects and scene settings. These extracted features are converted into textual descriptions using a template-based approach. The proposed video descriptions framework is evaluated on the NLDHA dataset using ROUGE scores and human judgment evaluation.
%R 10.18653/v1/W17-3512
%U https://aclanthology.org/W17-3512/
%U https://doi.org/10.18653/v1/W17-3512
%P 85-94
Markdown (Informal)
[Natural Language Descriptions for Human Activities in Video Streams](https://aclanthology.org/W17-3512/) (Alharbi & Gotoh, INLG 2017)
ACL