@article{long-etal-2018-video,
    title = "Video Captioning with Multi-Faceted Attention",
    author = "Long, Xiang  and
      Gan, Chuang  and
      de Melo, Gerard",
    editor = "Lee, Lillian  and
      Johnson, Mark  and
      Toutanova, Kristina  and
      Roark, Brian",
    journal = "Transactions of the Association for Computational Linguistics",
    volume = "6",
    year = "2018",
    address = "Cambridge, MA",
    publisher = "MIT Press",
    url = "https://aclanthology.org/Q18-1013",
    doi = "10.1162/tacl_a_00013",
    pages = "173--184",
    abstract = "Video captioning has attracted an increasing amount of interest, due in part to its potential for improved accessibility and information retrieval. While existing methods rely on different kinds of visual features and model architectures, they do not make full use of pertinent semantic cues. We present a unified and extensible framework to jointly leverage multiple sorts of visual features and semantic attributes. Our novel architecture builds on LSTMs with two multi-faceted attention layers. These first learn to automatically select the most salient visual features or semantic attributes, and then yield overall representations for the input and output of the sentence generation component via custom feature scaling operations. Experimental results on the challenging MSVD and MSR-VTT datasets show that our framework outperforms previous work and performs robustly even in the presence of added noise to the features and attributes.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="long-etal-2018-video">
    <titleInfo>
        <title>Video Captioning with Multi-Faceted Attention</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Xiang</namePart>
        <namePart type="family">Long</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Chuang</namePart>
        <namePart type="family">Gan</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Gerard</namePart>
        <namePart type="family">de Melo</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2018</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
        <titleInfo>
            <title>Transactions of the Association for Computational Linguistics</title>
        </titleInfo>
        <originInfo>
            <issuance>continuing</issuance>
            <publisher>MIT Press</publisher>
            <place>
                <placeTerm type="text">Cambridge, MA</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">periodical</genre>
        <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>Video captioning has attracted an increasing amount of interest, due in part to its potential for improved accessibility and information retrieval. While existing methods rely on different kinds of visual features and model architectures, they do not make full use of pertinent semantic cues. We present a unified and extensible framework to jointly leverage multiple sorts of visual features and semantic attributes. Our novel architecture builds on LSTMs with two multi-faceted attention layers. These first learn to automatically select the most salient visual features or semantic attributes, and then yield overall representations for the input and output of the sentence generation component via custom feature scaling operations. Experimental results on the challenging MSVD and MSR-VTT datasets show that our framework outperforms previous work and performs robustly even in the presence of added noise to the features and attributes.</abstract>
    <identifier type="citekey">long-etal-2018-video</identifier>
    <identifier type="doi">10.1162/tacl_a_00013</identifier>
    <location>
        <url>https://aclanthology.org/Q18-1013</url>
    </location>
    <part>
        <date>2018</date>
        <detail type="volume"><number>6</number></detail>
        <extent unit="page">
            <start>173</start>
            <end>184</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Journal Article
%T Video Captioning with Multi-Faceted Attention
%A Long, Xiang
%A Gan, Chuang
%A de Melo, Gerard
%J Transactions of the Association for Computational Linguistics
%D 2018
%V 6
%I MIT Press
%C Cambridge, MA
%F long-etal-2018-video
%X Video captioning has attracted an increasing amount of interest, due in part to its potential for improved accessibility and information retrieval. While existing methods rely on different kinds of visual features and model architectures, they do not make full use of pertinent semantic cues. We present a unified and extensible framework to jointly leverage multiple sorts of visual features and semantic attributes. Our novel architecture builds on LSTMs with two multi-faceted attention layers. These first learn to automatically select the most salient visual features or semantic attributes, and then yield overall representations for the input and output of the sentence generation component via custom feature scaling operations. Experimental results on the challenging MSVD and MSR-VTT datasets show that our framework outperforms previous work and performs robustly even in the presence of added noise to the features and attributes.
%R 10.1162/tacl_a_00013
%U https://aclanthology.org/Q18-1013
%U https://doi.org/10.1162/tacl_a_00013
%P 173-184
Markdown (Informal)
[Video Captioning with Multi-Faceted Attention](https://aclanthology.org/Q18-1013) (Long et al., TACL 2018)
ACL
Xiang Long, Chuang Gan, and Gerard de Melo. 2018. Video Captioning with Multi-Faceted Attention. Transactions of the Association for Computational Linguistics, 6:173–184.