@inproceedings{song-etal-2024-context-aware,
title = "Context-aware Visual Storytelling with Visual Prefix Tuning and Contrastive Learning",
author = "Song, Yingjin and
Paperno, Denis and
Gatt, Albert",
editor = "Mahamood, Saad and
Minh, Nguyen Le and
Ippolito, Daphne",
booktitle = "Proceedings of the 17th International Natural Language Generation Conference",
month = sep,
year = "2024",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.inlg-main.32",
pages = "384--401",
abstract = "Visual storytelling systems generate multi-sentence stories from image sequences. In this task, capturing contextual information and bridging visual variation bring additional challenges. We propose a simple yet effective framework that leverages the generalization capabilities of pretrained foundation models, only training a lightweight vision-language mapping network to connect modalities, while incorporating context to enhance coherence. We introduce a multimodal contrastive objective that also improves visual relevance and story informativeness. Extensive experimental results, across both automatic metrics and human evaluations, demonstrate that the stories generated by our framework are diverse, coherent, informative, and interesting.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="song-etal-2024-context-aware">
<titleInfo>
<title>Context-aware Visual Storytelling with Visual Prefix Tuning and Contrastive Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yingjin</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denis</namePart>
<namePart type="family">Paperno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Albert</namePart>
<namePart type="family">Gatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Natural Language Generation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saad</namePart>
<namePart type="family">Mahamood</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nguyen</namePart>
<namePart type="given">Le</namePart>
<namePart type="family">Minh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daphne</namePart>
<namePart type="family">Ippolito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Tokyo, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual storytelling systems generate multi-sentence stories from image sequences. In this task, capturing contextual information and bridging visual variation bring additional challenges. We propose a simple yet effective framework that leverages the generalization capabilities of pretrained foundation models, only training a lightweight vision-language mapping network to connect modalities, while incorporating context to enhance coherence. We introduce a multimodal contrastive objective that also improves visual relevance and story informativeness. Extensive experimental results, across both automatic metrics and human evaluations, demonstrate that the stories generated by our framework are diverse, coherent, informative, and interesting.</abstract>
<identifier type="citekey">song-etal-2024-context-aware</identifier>
<location>
<url>https://aclanthology.org/2024.inlg-main.32</url>
</location>
<part>
<date>2024-09</date>
<extent unit="page">
<start>384</start>
<end>401</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Context-aware Visual Storytelling with Visual Prefix Tuning and Contrastive Learning
%A Song, Yingjin
%A Paperno, Denis
%A Gatt, Albert
%Y Mahamood, Saad
%Y Minh, Nguyen Le
%Y Ippolito, Daphne
%S Proceedings of the 17th International Natural Language Generation Conference
%D 2024
%8 September
%I Association for Computational Linguistics
%C Tokyo, Japan
%F song-etal-2024-context-aware
%X Visual storytelling systems generate multi-sentence stories from image sequences. In this task, capturing contextual information and bridging visual variation bring additional challenges. We propose a simple yet effective framework that leverages the generalization capabilities of pretrained foundation models, only training a lightweight vision-language mapping network to connect modalities, while incorporating context to enhance coherence. We introduce a multimodal contrastive objective that also improves visual relevance and story informativeness. Extensive experimental results, across both automatic metrics and human evaluations, demonstrate that the stories generated by our framework are diverse, coherent, informative, and interesting.
%U https://aclanthology.org/2024.inlg-main.32
%P 384-401
Markdown (Informal)
[Context-aware Visual Storytelling with Visual Prefix Tuning and Contrastive Learning](https://aclanthology.org/2024.inlg-main.32) (Song et al., INLG 2024)
ACL