@inproceedings{zang-zhang-2024-interpretation,
title = "The Interpretation Gap in Text-to-Music Generation Models",
author = "Zang, Yongyi and
Zhang, Yixiao",
editor = "Kruspe, Anna and
Oramas, Sergio and
Epure, Elena V. and
Sordo, Mohamed and
Weck, Benno and
Doh, SeungHeon and
Won, Minz and
Manco, Ilaria and
Meseguer-Brocal, Gabriel",
booktitle = "Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA)",
month = nov,
year = "2024",
address = "Oakland, USA",
    publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4musa-1.18/",
pages = "112--118",
abstract = "Large-scale text-to-music generation models have significantly enhanced music creation capabilities, offering unprecedented creative freedom. However, their ability to collaborate effectively with human musicians remains limited. In this paper, we propose a framework to describe the musical interaction process, which includes expression, interpretation, and execution of controls. Following this framework, we argue that the primary gap between existing text-to-music models and musicians lies in the interpretation stage, where models lack the ability to interpret controls from musicians. We also propose two strategies to address this gap and call on the music information retrieval community to tackle the interpretation challenge to improve human-AI musical collaboration."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zang-zhang-2024-interpretation">
<titleInfo>
<title>The Interpretation Gap in Text-to-Music Generation Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yongyi</namePart>
<namePart type="family">Zang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yixiao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kruspe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="family">Oramas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Epure</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Sordo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benno</namePart>
<namePart type="family">Weck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">SeungHeon</namePart>
<namePart type="family">Doh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minz</namePart>
<namePart type="family">Won</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilaria</namePart>
<namePart type="family">Manco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Meseguer-Brocal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Oakland, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large-scale text-to-music generation models have significantly enhanced music creation capabilities, offering unprecedented creative freedom. However, their ability to collaborate effectively with human musicians remains limited. In this paper, we propose a framework to describe the musical interaction process, which includes expression, interpretation, and execution of controls. Following this framework, we argue that the primary gap between existing text-to-music models and musicians lies in the interpretation stage, where models lack the ability to interpret controls from musicians. We also propose two strategies to address this gap and call on the music information retrieval community to tackle the interpretation challenge to improve human-AI musical collaboration.</abstract>
<identifier type="citekey">zang-zhang-2024-interpretation</identifier>
<location>
<url>https://aclanthology.org/2024.nlp4musa-1.18/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>112</start>
<end>118</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Interpretation Gap in Text-to-Music Generation Models
%A Zang, Yongyi
%A Zhang, Yixiao
%Y Kruspe, Anna
%Y Oramas, Sergio
%Y Epure, Elena V.
%Y Sordo, Mohamed
%Y Weck, Benno
%Y Doh, SeungHeon
%Y Won, Minz
%Y Manco, Ilaria
%Y Meseguer-Brocal, Gabriel
%S Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA)
%D 2024
%8 November
%I Association for Computational Linguistics
%C Oakland, USA
%F zang-zhang-2024-interpretation
%X Large-scale text-to-music generation models have significantly enhanced music creation capabilities, offering unprecedented creative freedom. However, their ability to collaborate effectively with human musicians remains limited. In this paper, we propose a framework to describe the musical interaction process, which includes expression, interpretation, and execution of controls. Following this framework, we argue that the primary gap between existing text-to-music models and musicians lies in the interpretation stage, where models lack the ability to interpret controls from musicians. We also propose two strategies to address this gap and call on the music information retrieval community to tackle the interpretation challenge to improve human-AI musical collaboration.
%U https://aclanthology.org/2024.nlp4musa-1.18/
%P 112-118
Markdown (Informal)
[The Interpretation Gap in Text-to-Music Generation Models](https://aclanthology.org/2024.nlp4musa-1.18/) (Zang & Zhang, NLP4MusA 2024)
ACL
Yongyi Zang and Yixiao Zhang. 2024. [The Interpretation Gap in Text-to-Music Generation Models](https://aclanthology.org/2024.nlp4musa-1.18/). In *Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA)*, pages 112–118, Oakland, USA. Association for Computational Linguistics.