@inproceedings{bae-etal-2023-sound,
title = "Sound of Story: Multi-modal Storytelling with Audio",
author = "Bae, Jaeyeon and
Jeong, Seokhoon and
Kang, Seokun and
Han, Namgi and
Lee, Jae-Yon and
Kim, Hyounghun and
Kim, Taehwan",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.898",
doi = "10.18653/v1/2023.findings-emnlp.898",
pages = "13467--13479",
abstract = "Storytelling is multi-modal in the real world. When one tells a story, one may use all of the visualizations and sounds along with the story itself. However, prior studies on storytelling datasets and tasks have paid little attention to sound even though sound also conveys meaningful semantics of the story. Therefore, we propose to extend story understanding and telling areas by establishing a new component called background sound which is story context-based audio without any linguistic information. For this purpose, we introduce a new dataset, called Sound of Story (SoS), which has paired image and text sequences with corresponding sound or background music for a story. To the best of our knowledge, this is the largest well-curated dataset for storytelling with sound. Our SoS dataset consists of 27,354 stories with 19.6 images per story and 984 hours of speech-decoupled audio such as background music and other sounds. As benchmark tasks for storytelling with sound and the dataset, we propose retrieval tasks between modalities, and audio generation tasks from image-text sequences, introducing strong baselines for them. We believe the proposed dataset and tasks may shed light on the multi-modal understanding of storytelling in terms of sound.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bae-etal-2023-sound">
<titleInfo>
<title>Sound of Story: Multi-modal Storytelling with Audio</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jaeyeon</namePart>
<namePart type="family">Bae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhoon</namePart>
<namePart type="family">Jeong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokun</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Namgi</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jae-Yon</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyounghun</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taehwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Storytelling is multi-modal in the real world. When one tells a story, one may use all of the visualizations and sounds along with the story itself. However, prior studies on storytelling datasets and tasks have paid little attention to sound even though sound also conveys meaningful semantics of the story. Therefore, we propose to extend story understanding and telling areas by establishing a new component called background sound which is story context-based audio without any linguistic information. For this purpose, we introduce a new dataset, called Sound of Story (SoS), which has paired image and text sequences with corresponding sound or background music for a story. To the best of our knowledge, this is the largest well-curated dataset for storytelling with sound. Our SoS dataset consists of 27,354 stories with 19.6 images per story and 984 hours of speech-decoupled audio such as background music and other sounds. As benchmark tasks for storytelling with sound and the dataset, we propose retrieval tasks between modalities, and audio generation tasks from image-text sequences, introducing strong baselines for them. We believe the proposed dataset and tasks may shed light on the multi-modal understanding of storytelling in terms of sound.</abstract>
<identifier type="citekey">bae-etal-2023-sound</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.898</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.898</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>13467</start>
<end>13479</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sound of Story: Multi-modal Storytelling with Audio
%A Bae, Jaeyeon
%A Jeong, Seokhoon
%A Kang, Seokun
%A Han, Namgi
%A Lee, Jae-Yon
%A Kim, Hyounghun
%A Kim, Taehwan
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F bae-etal-2023-sound
%X Storytelling is multi-modal in the real world. When one tells a story, one may use all of the visualizations and sounds along with the story itself. However, prior studies on storytelling datasets and tasks have paid little attention to sound even though sound also conveys meaningful semantics of the story. Therefore, we propose to extend story understanding and telling areas by establishing a new component called background sound which is story context-based audio without any linguistic information. For this purpose, we introduce a new dataset, called Sound of Story (SoS), which has paired image and text sequences with corresponding sound or background music for a story. To the best of our knowledge, this is the largest well-curated dataset for storytelling with sound. Our SoS dataset consists of 27,354 stories with 19.6 images per story and 984 hours of speech-decoupled audio such as background music and other sounds. As benchmark tasks for storytelling with sound and the dataset, we propose retrieval tasks between modalities, and audio generation tasks from image-text sequences, introducing strong baselines for them. We believe the proposed dataset and tasks may shed light on the multi-modal understanding of storytelling in terms of sound.
%R 10.18653/v1/2023.findings-emnlp.898
%U https://aclanthology.org/2023.findings-emnlp.898
%U https://doi.org/10.18653/v1/2023.findings-emnlp.898
%P 13467-13479
Markdown (Informal)
[Sound of Story: Multi-modal Storytelling with Audio](https://aclanthology.org/2023.findings-emnlp.898) (Bae et al., Findings 2023)
ACL
- Jaeyeon Bae, Seokhoon Jeong, Seokun Kang, Namgi Han, Jae-Yon Lee, Hyounghun Kim, and Taehwan Kim. 2023. Sound of Story: Multi-modal Storytelling with Audio. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 13467–13479, Singapore. Association for Computational Linguistics.