@inproceedings{xiao-etal-2019-guiding,
title = "Guiding the Flowing of Semantics: Interpretable Video Captioning via {POS} Tag",
author = "Xiao, Xinyu and
Wang, Lingfeng and
Fan, Bin and
Xiang, Shinming and
Pan, Chunhong",
editor = "Inui, Kentaro and
Jiang, Jing and
Ng, Vincent and
Wan, Xiaojun",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D19-1213",
doi = "10.18653/v1/D19-1213",
pages = "2068--2077",
abstract = "In the current video captioning models, the video frames are collected in one network and the semantics are mixed into one feature, which not only increase the difficulty of the caption decoding, but also decrease the interpretability of the captioning models. To address these problems, we propose an Adaptive Semantic Guidance Network (ASGN), which instantiates the whole video semantics to different POS-aware semantics with the supervision of part of speech (POS) tag. In the encoding process, the POS tag activates the related neurons and parses the whole semantic information into corresponding encoded video representations. Furthermore, the potential of the model is stimulated by the POS-aware video features. In the decoding process, the related video features of noun and verb are used as the supervision to construct a new adaptive attention model which can decide whether to attend to the video feature or not. With the explicit improving of the interpretability of the network, the learning process is more transparent and the results are more predictable. Extensive experiments demonstrate the effectiveness of our model when compared with state-of-the-art models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xiao-etal-2019-guiding">
<titleInfo>
<title>Guiding the Flowing of Semantics: Interpretable Video Captioning via POS Tag</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xinyu</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lingfeng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shinming</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chunhong</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vincent</namePart>
<namePart type="family">Ng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hong Kong, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In the current video captioning models, the video frames are collected in one network and the semantics are mixed into one feature, which not only increase the difficulty of the caption decoding, but also decrease the interpretability of the captioning models. To address these problems, we propose an Adaptive Semantic Guidance Network (ASGN), which instantiates the whole video semantics to different POS-aware semantics with the supervision of part of speech (POS) tag. In the encoding process, the POS tag activates the related neurons and parses the whole semantic information into corresponding encoded video representations. Furthermore, the potential of the model is stimulated by the POS-aware video features. In the decoding process, the related video features of noun and verb are used as the supervision to construct a new adaptive attention model which can decide whether to attend to the video feature or not. With the explicit improving of the interpretability of the network, the learning process is more transparent and the results are more predictable. Extensive experiments demonstrate the effectiveness of our model when compared with state-of-the-art models.</abstract>
<identifier type="citekey">xiao-etal-2019-guiding</identifier>
<identifier type="doi">10.18653/v1/D19-1213</identifier>
<location>
<url>https://aclanthology.org/D19-1213</url>
</location>
<part>
<date>2019-11</date>
<extent unit="page">
<start>2068</start>
<end>2077</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Guiding the Flowing of Semantics: Interpretable Video Captioning via POS Tag
%A Xiao, Xinyu
%A Wang, Lingfeng
%A Fan, Bin
%A Xiang, Shinming
%A Pan, Chunhong
%Y Inui, Kentaro
%Y Jiang, Jing
%Y Ng, Vincent
%Y Wan, Xiaojun
%S Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F xiao-etal-2019-guiding
%X In the current video captioning models, the video frames are collected in one network and the semantics are mixed into one feature, which not only increase the difficulty of the caption decoding, but also decrease the interpretability of the captioning models. To address these problems, we propose an Adaptive Semantic Guidance Network (ASGN), which instantiates the whole video semantics to different POS-aware semantics with the supervision of part of speech (POS) tag. In the encoding process, the POS tag activates the related neurons and parses the whole semantic information into corresponding encoded video representations. Furthermore, the potential of the model is stimulated by the POS-aware video features. In the decoding process, the related video features of noun and verb are used as the supervision to construct a new adaptive attention model which can decide whether to attend to the video feature or not. With the explicit improving of the interpretability of the network, the learning process is more transparent and the results are more predictable. Extensive experiments demonstrate the effectiveness of our model when compared with state-of-the-art models.
%R 10.18653/v1/D19-1213
%U https://aclanthology.org/D19-1213
%U https://doi.org/10.18653/v1/D19-1213
%P 2068-2077
Markdown (Informal)
[Guiding the Flowing of Semantics: Interpretable Video Captioning via POS Tag](https://aclanthology.org/D19-1213) (Xiao et al., EMNLP-IJCNLP 2019)
ACL
- Xinyu Xiao, Lingfeng Wang, Bin Fan, Shinming Xiang, and Chunhong Pan. 2019. Guiding the Flowing of Semantics: Interpretable Video Captioning via POS Tag. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 2068–2077, Hong Kong, China. Association for Computational Linguistics.