@inproceedings{wang-etal-2018-bridge,
title = "Bridge Video and Text with Cascade Syntactic Structure",
author = "Wang, Guolong and
Qin, Zheng and
Xu, Kaiping and
Huang, Kai and
Ye, Shuxiong",
editor = "Bender, Emily M. and
Derczynski, Leon and
Isabelle, Pierre",
booktitle = "Proceedings of the 27th International Conference on Computational Linguistics",
month = aug,
year = "2018",
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/C18-1303/",
pages = "3576--3585",
abstract = "We present a video captioning approach that encodes features by progressively completing syntactic structure (LSTM-CSS). To construct basic syntactic structure (i.e., subject, predicate, and object), we use a Conditional Random Field to label semantic representations (i.e., motions, objects). We argue that in order to improve the comprehensiveness of the description, the local features within object regions can be used to generate complementary syntactic elements (e.g., attribute, adverbial). Inspired by redundancy of human receptors, we utilize a Region Proposal Network to focus on the object regions. To model the final temporal dynamics, Recurrent Neural Network with Path Embeddings is adopted. We demonstrate the effectiveness of LSTM-CSS on generating natural sentences: 42.3{\%} and 28.5{\%} in terms of BLEU@4 and METEOR. Superior performance when compared to state-of-the-art methods are reported on a large video description dataset (i.e., MSR-VTT-2016)."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2018-bridge">
<titleInfo>
<title>Bridge Video and Text with Cascade Syntactic Structure</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guolong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaiping</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuxiong</namePart>
<namePart type="family">Ye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 27th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Bender</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leon</namePart>
<namePart type="family">Derczynski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Isabelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Santa Fe, New Mexico, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a video captioning approach that encodes features by progressively completing syntactic structure (LSTM-CSS). To construct basic syntactic structure (i.e., subject, predicate, and object), we use a Conditional Random Field to label semantic representations (i.e., motions, objects). We argue that in order to improve the comprehensiveness of the description, the local features within object regions can be used to generate complementary syntactic elements (e.g., attribute, adverbial). Inspired by redundancy of human receptors, we utilize a Region Proposal Network to focus on the object regions. To model the final temporal dynamics, Recurrent Neural Network with Path Embeddings is adopted. We demonstrate the effectiveness of LSTM-CSS on generating natural sentences: 42.3% and 28.5% in terms of BLEU@4 and METEOR. Superior performance when compared to state-of-the-art methods are reported on a large video description dataset (i.e., MSR-VTT-2016).</abstract>
<identifier type="citekey">wang-etal-2018-bridge</identifier>
<location>
<url>https://aclanthology.org/C18-1303/</url>
</location>
<part>
<date>2018-08</date>
<extent unit="page">
<start>3576</start>
<end>3585</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bridge Video and Text with Cascade Syntactic Structure
%A Wang, Guolong
%A Qin, Zheng
%A Xu, Kaiping
%A Huang, Kai
%A Ye, Shuxiong
%Y Bender, Emily M.
%Y Derczynski, Leon
%Y Isabelle, Pierre
%S Proceedings of the 27th International Conference on Computational Linguistics
%D 2018
%8 August
%I Association for Computational Linguistics
%C Santa Fe, New Mexico, USA
%F wang-etal-2018-bridge
%X We present a video captioning approach that encodes features by progressively completing syntactic structure (LSTM-CSS). To construct basic syntactic structure (i.e., subject, predicate, and object), we use a Conditional Random Field to label semantic representations (i.e., motions, objects). We argue that in order to improve the comprehensiveness of the description, the local features within object regions can be used to generate complementary syntactic elements (e.g., attribute, adverbial). Inspired by redundancy of human receptors, we utilize a Region Proposal Network to focus on the object regions. To model the final temporal dynamics, Recurrent Neural Network with Path Embeddings is adopted. We demonstrate the effectiveness of LSTM-CSS on generating natural sentences: 42.3% and 28.5% in terms of BLEU@4 and METEOR. Superior performance when compared to state-of-the-art methods are reported on a large video description dataset (i.e., MSR-VTT-2016).
%U https://aclanthology.org/C18-1303/
%P 3576-3585
Markdown (Informal)
[Bridge Video and Text with Cascade Syntactic Structure](https://aclanthology.org/C18-1303/) (Wang et al., COLING 2018)
ACL
- Guolong Wang, Zheng Qin, Kaiping Xu, Kai Huang, and Shuxiong Ye. 2018. Bridge Video and Text with Cascade Syntactic Structure. In Proceedings of the 27th International Conference on Computational Linguistics, pages 3576–3585, Santa Fe, New Mexico, USA. Association for Computational Linguistics.