@inproceedings{hsu-etal-2021-stretch,
title = "Stretch-{VST}: Getting Flexible With Visual Stories",
author = "Hsu, Chi-yang and
Chu, Yun-Wei and
Yang, Tsai-Lun and
Huang, Ting-Hao and
Ku, Lun-Wei",
editor = "Ji, Heng and
Park, Jong C. and
Xia, Rui",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-demo.42",
doi = "10.18653/v1/2021.acl-demo.42",
pages = "356--362",
abstract = "In visual storytelling, a short story is generated based on a given image sequence. Despite years of work, most visual storytelling models remain limited in terms of the generated stories{'} fixed length: most models produce stories with exactly five sentences because five-sentence stories dominate the training data. The fix-length stories carry limited details and provide ambiguous textual information to the readers. Therefore, we propose to {``}stretch{''} the stories, which create the potential to present in-depth visual details. This paper presents Stretch-VST, a visual storytelling framework that enables the generation of prolonged stories by adding appropriate knowledge, which is selected by the proposed scoring function. We propose a length-controlled Transformer to generate long stories. This model introduces novel positional encoding methods to maintain story quality with lengthy inputs. Experiments confirm that long stories are generated without deteriorating the quality. The human evaluation further shows that Stretch-VST can provide better focus and detail when stories are prolonged compared to state of the art. We create a webpage to demonstrate our prolonged capability.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hsu-etal-2021-stretch">
<titleInfo>
<title>Stretch-VST: Getting Flexible With Visual Stories</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chi-yang</namePart>
<namePart type="family">Hsu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Wei</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsai-Lun</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ting-Hao</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jong</namePart>
<namePart type="given">C</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In visual storytelling, a short story is generated based on a given image sequence. Despite years of work, most visual storytelling models remain limited in terms of the generated stories’ fixed length: most models produce stories with exactly five sentences because five-sentence stories dominate the training data. The fix-length stories carry limited details and provide ambiguous textual information to the readers. Therefore, we propose to “stretch” the stories, which create the potential to present in-depth visual details. This paper presents Stretch-VST, a visual storytelling framework that enables the generation of prolonged stories by adding appropriate knowledge, which is selected by the proposed scoring function. We propose a length-controlled Transformer to generate long stories. This model introduces novel positional encoding methods to maintain story quality with lengthy inputs. Experiments confirm that long stories are generated without deteriorating the quality. The human evaluation further shows that Stretch-VST can provide better focus and detail when stories are prolonged compared to state of the art. We create a webpage to demonstrate our prolonged capability.</abstract>
<identifier type="citekey">hsu-etal-2021-stretch</identifier>
<identifier type="doi">10.18653/v1/2021.acl-demo.42</identifier>
<location>
<url>https://aclanthology.org/2021.acl-demo.42</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>356</start>
<end>362</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Stretch-VST: Getting Flexible With Visual Stories
%A Hsu, Chi-yang
%A Chu, Yun-Wei
%A Yang, Tsai-Lun
%A Huang, Ting-Hao
%A Ku, Lun-Wei
%Y Ji, Heng
%Y Park, Jong C.
%Y Xia, Rui
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F hsu-etal-2021-stretch
%X In visual storytelling, a short story is generated based on a given image sequence. Despite years of work, most visual storytelling models remain limited in terms of the generated stories’ fixed length: most models produce stories with exactly five sentences because five-sentence stories dominate the training data. The fix-length stories carry limited details and provide ambiguous textual information to the readers. Therefore, we propose to “stretch” the stories, which create the potential to present in-depth visual details. This paper presents Stretch-VST, a visual storytelling framework that enables the generation of prolonged stories by adding appropriate knowledge, which is selected by the proposed scoring function. We propose a length-controlled Transformer to generate long stories. This model introduces novel positional encoding methods to maintain story quality with lengthy inputs. Experiments confirm that long stories are generated without deteriorating the quality. The human evaluation further shows that Stretch-VST can provide better focus and detail when stories are prolonged compared to state of the art. We create a webpage to demonstrate our prolonged capability.
%R 10.18653/v1/2021.acl-demo.42
%U https://aclanthology.org/2021.acl-demo.42
%U https://doi.org/10.18653/v1/2021.acl-demo.42
%P 356-362
Markdown (Informal)
[Stretch-VST: Getting Flexible With Visual Stories](https://aclanthology.org/2021.acl-demo.42) (Hsu et al., ACL-IJCNLP 2021)
ACL
- Chi-yang Hsu, Yun-Wei Chu, Tsai-Lun Yang, Ting-Hao Huang, and Lun-Wei Ku. 2021. Stretch-VST: Getting Flexible With Visual Stories. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations, pages 356–362, Online. Association for Computational Linguistics.