@inproceedings{hong-etal-2024-summary,
title = "Summary of the Visually Grounded Story Generation Challenge",
author = "Hong, Xudong and
Sayeed, Asad and
Demberg, Vera",
editor = "Mille, Simon and
Clinciu, Miruna-Adriana",
booktitle = "Proceedings of the 17th International Natural Language Generation Conference: Generation Challenges",
month = sep,
year = "2024",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.inlg-genchal.3",
pages = "39--46",
abstract = "Recent advancements in vision-and-language models have opened new possibilities for natural language generation, particularly in generating creative stories from visual input. We thus host an open-sourced shared task, Visually Grounded Story Generation (VGSG), to explore whether these models can create coherent, diverse, and visually grounded narratives. This task challenges participants to generate coherent stories based on sequences of images, where characters and events must be grounded in the images provided. The task is structured into two tracks: the Closed track with constraints on fixed visual features and the Open track which allows all kinds of models. We propose the first two-stage model using GPT-4o as the baseline for the Open track that first generates descriptions for the images and then creates a story based on those descriptions. Human and automatic evaluations indicate that: 1) Retrieval augmentation helps generate more human-like stories; 2) Large-scale pre-trained LLM improves story quality by a large margin; 3) Traditional automatic metrics cannot capture the overall quality.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hong-etal-2024-summary">
<titleInfo>
<title>Summary of the Visually Grounded Story Generation Challenge</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xudong</namePart>
<namePart type="family">Hong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asad</namePart>
<namePart type="family">Sayeed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Natural Language Generation Conference: Generation Challenges</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna-Adriana</namePart>
<namePart type="family">Clinciu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Tokyo, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent advancements in vision-and-language models have opened new possibilities for natural language generation, particularly in generating creative stories from visual input. We thus host an open-sourced shared task, Visually Grounded Story Generation (VGSG), to explore whether these models can create coherent, diverse, and visually grounded narratives. This task challenges participants to generate coherent stories based on sequences of images, where characters and events must be grounded in the images provided. The task is structured into two tracks: the Closed track with constraints on fixed visual features and the Open track which allows all kinds of models. We propose the first two-stage model using GPT-4o as the baseline for the Open track that first generates descriptions for the images and then creates a story based on those descriptions. Human and automatic evaluations indicate that: 1) Retrieval augmentation helps generate more human-like stories; 2) Large-scale pre-trained LLM improves story quality by a large margin; 3) Traditional automatic metrics cannot capture the overall quality.</abstract>
<identifier type="citekey">hong-etal-2024-summary</identifier>
<location>
<url>https://aclanthology.org/2024.inlg-genchal.3</url>
</location>
<part>
<date>2024-09</date>
<extent unit="page">
<start>39</start>
<end>46</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Summary of the Visually Grounded Story Generation Challenge
%A Hong, Xudong
%A Sayeed, Asad
%A Demberg, Vera
%Y Mille, Simon
%Y Clinciu, Miruna-Adriana
%S Proceedings of the 17th International Natural Language Generation Conference: Generation Challenges
%D 2024
%8 September
%I Association for Computational Linguistics
%C Tokyo, Japan
%F hong-etal-2024-summary
%X Recent advancements in vision-and-language models have opened new possibilities for natural language generation, particularly in generating creative stories from visual input. We thus host an open-sourced shared task, Visually Grounded Story Generation (VGSG), to explore whether these models can create coherent, diverse, and visually grounded narratives. This task challenges participants to generate coherent stories based on sequences of images, where characters and events must be grounded in the images provided. The task is structured into two tracks: the Closed track with constraints on fixed visual features and the Open track which allows all kinds of models. We propose the first two-stage model using GPT-4o as the baseline for the Open track that first generates descriptions for the images and then creates a story based on those descriptions. Human and automatic evaluations indicate that: 1) Retrieval augmentation helps generate more human-like stories; 2) Large-scale pre-trained LLM improves story quality by a large margin; 3) Traditional automatic metrics cannot capture the overall quality.
%U https://aclanthology.org/2024.inlg-genchal.3
%P 39-46
Markdown (Informal)
[Summary of the Visually Grounded Story Generation Challenge](https://aclanthology.org/2024.inlg-genchal.3) (Hong et al., INLG 2024)
ACL