@inproceedings{gonzales-rudzicz-2024-retrieval,
title = "A Retrieval Augmented Approach for Text-to-Music Generation",
author = "Gonzales, Robie and
Rudzicz, Frank",
editor = "Kruspe, Anna and
Oramas, Sergio and
Epure, Elena V. and
Sordo, Mohamed and
Weck, Benno and
Doh, SeungHeon and
Won, Minz and
Manco, Ilaria and
Meseguer-Brocal, Gabriel",
booktitle = "Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA)",
month = nov,
year = "2024",
address = "Oakland, USA",
    publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4musa-1.6/",
pages = "31--36",
    abstract = "Generative text-to-music models such as MusicGen are capable of generating high fidelity music conditioned on a text prompt. However, expressing the essential features of music with text is a challenging task. In this paper, we present a retrieval-augmented approach for text-to-music generation. We first pre-compute a dataset of text-music embeddings obtained from a contrastive language-audio pretrained encoder (CLAP). Then, given an input text prompt, we retrieve the top $k$ most similar musical aspects and augment the original prompt. This approach consistently generates music of higher audio quality as measured by the Fr{\'e}chet Audio Distance. We analyze the internal representations of MusicGen and find that augmented prompts lead to greater diversity in token distributions and display high text adherence. Our findings show the potential for increased control in text-to-music generation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gonzales-rudzicz-2024-retrieval">
<titleInfo>
<title>A Retrieval Augmented Approach for Text-to-Music Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Robie</namePart>
<namePart type="family">Gonzales</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frank</namePart>
<namePart type="family">Rudzicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kruspe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="family">Oramas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Epure</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Sordo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benno</namePart>
<namePart type="family">Weck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">SeungHeon</namePart>
<namePart type="family">Doh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minz</namePart>
<namePart type="family">Won</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilaria</namePart>
<namePart type="family">Manco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Meseguer-Brocal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Oakland, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Generative text-to-music models such as MusicGen are capable of generating high fidelity music conditioned on a text prompt. However, expressing the essential features of music with text is a challenging task. In this paper, we present a retrieval-augmented approach for text-to-music generation. We first pre-compute a dataset of text-music embeddings obtained from a contrastive language-audio pretrained encoder (CLAP). Then, given an input text prompt, we retrieve the top k most similar musical aspects and augment the original prompt. This approach consistently generates music of higher audio quality as measured by the Fréchet Audio Distance. We analyze the internal representations of MusicGen and find that augmented prompts lead to greater diversity in token distributions and display high text adherence. Our findings show the potential for increased control in text-to-music generation.</abstract>
<identifier type="citekey">gonzales-rudzicz-2024-retrieval</identifier>
<location>
<url>https://aclanthology.org/2024.nlp4musa-1.6/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>31</start>
<end>36</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Retrieval Augmented Approach for Text-to-Music Generation
%A Gonzales, Robie
%A Rudzicz, Frank
%Y Kruspe, Anna
%Y Oramas, Sergio
%Y Epure, Elena V.
%Y Sordo, Mohamed
%Y Weck, Benno
%Y Doh, SeungHeon
%Y Won, Minz
%Y Manco, Ilaria
%Y Meseguer-Brocal, Gabriel
%S Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA)
%D 2024
%8 November
%I Association for Computational Linguistics
%C Oakland, USA
%F gonzales-rudzicz-2024-retrieval
%X Generative text-to-music models such as MusicGen are capable of generating high fidelity music conditioned on a text prompt. However, expressing the essential features of music with text is a challenging task. In this paper, we present a retrieval-augmented approach for text-to-music generation. We first pre-compute a dataset of text-music embeddings obtained from a contrastive language-audio pretrained encoder (CLAP). Then, given an input text prompt, we retrieve the top k most similar musical aspects and augment the original prompt. This approach consistently generates music of higher audio quality as measured by the Fréchet Audio Distance. We analyze the internal representations of MusicGen and find that augmented prompts lead to greater diversity in token distributions and display high text adherence. Our findings show the potential for increased control in text-to-music generation.
%U https://aclanthology.org/2024.nlp4musa-1.6/
%P 31-36
Markdown (Informal)
[A Retrieval Augmented Approach for Text-to-Music Generation](https://aclanthology.org/2024.nlp4musa-1.6/) (Gonzales & Rudzicz, NLP4MusA 2024)
ACL
Robie Gonzales and Frank Rudzicz. 2024. A Retrieval Augmented Approach for Text-to-Music Generation. In Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA), pages 31–36, Oakland, USA. Association for Computational Linguistics.
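
For readers who want a concrete picture of the retrieval step described in the abstract (pre-computing CLAP text-music embeddings, then retrieving the top k most similar musical aspects to augment the prompt), the following is a minimal sketch. The `retrieve_top_k` and `augment_prompt` helpers, the aspect-bank inputs, and the default k=3 are illustrative assumptions, not the paper's implementation; the prompt and aspect embeddings are assumed to come from a CLAP text encoder.

```python
import numpy as np

def retrieve_top_k(prompt_embedding: np.ndarray,
                   aspect_embeddings: np.ndarray,
                   aspect_texts: list[str],
                   k: int = 3) -> list[str]:
    """Return the k aspect descriptions whose CLAP embeddings are most
    cosine-similar to the prompt embedding (a sketch, not the paper's code)."""
    q = prompt_embedding / np.linalg.norm(prompt_embedding)
    bank = aspect_embeddings / np.linalg.norm(aspect_embeddings, axis=1, keepdims=True)
    scores = bank @ q                    # cosine similarity against the whole bank
    top = np.argsort(scores)[::-1][:k]   # indices of the k highest scores
    return [aspect_texts[i] for i in top]

def augment_prompt(prompt: str, retrieved: list[str]) -> str:
    """Append the retrieved musical aspects to the original text prompt
    before passing it to the text-to-music model (e.g. MusicGen)."""
    return prompt + ", " + ", ".join(retrieved)
```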