@inproceedings{wang-etal-2025-generative,
title = "Generative Music Models' Alignment with Professional and Amateur Users' Expectations",
author = "Wang, Zihao and
Yu, Jiaxing and
Liu, Haoxuan and
Zheng, Zehui and
Jin, Yuhang and
Li, Shuyu and
Ji, Shulei and
Zhang, Kejun",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.360/",
doi = "10.18653/v1/2025.findings-acl.360",
pages = "6909--6920",
ISBN = "979-8-89176-256-5",
abstract = "Recent years have witnessed rapid advancements in text-to-music generation using large language models, yielding notable outputs. A critical challenge is understanding users with diverse musical expertise and generating music that meets their expectations, an area that remains underexplored.To address this gap, we introduce the novel task of Professional and Amateur Description-to-Song Generation. This task focuses on aligning generated content with human expressions from varying musical proficiency levels, aiming to produce songs that accurately meet auditory expectations and adhere to musical structural conventions. We utilized the MuChin dataset, which contains annotations from both professionals and amateurs for identical songs, as the source for these distinct description types. We also collected a pre-train dataset of over 1.5 million songs; lyrics were included for some, while for others, lyrics were generated using Automatic Speech Recognition (ASR) models.Furthermore, we propose MuDiT/MuSiT, a single-stage framework designed to enhance human-machine alignment in song generation. This framework employs Chinese MuLan (ChinMu) for cross-modal comprehension between natural language descriptions and auditory musical attributes, thereby aligning generated songs with user-defined outcomes. Concurrently, a DiT/SiT model facilitates end-to-end generation of complete songs audio, encompassing both vocals and instrumentation. We proposed metrics to evaluate semantic and auditory discrepancies between generated content and target music. Experimental results demonstrate that MuDiT/MuSiT outperforms baseline models and exhibits superior alignment with both professional and amateur song descriptions."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2025-generative">
<titleInfo>
<title>Generative Music Models’ Alignment with Professional and Amateur Users’ Expectations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zihao</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaxing</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoxuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zehui</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuhang</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuyu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shulei</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kejun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Recent years have witnessed rapid advancements in text-to-music generation using large language models, yielding notable outputs. A critical challenge is understanding users with diverse musical expertise and generating music that meets their expectations, an area that remains underexplored.To address this gap, we introduce the novel task of Professional and Amateur Description-to-Song Generation. This task focuses on aligning generated content with human expressions from varying musical proficiency levels, aiming to produce songs that accurately meet auditory expectations and adhere to musical structural conventions. We utilized the MuChin dataset, which contains annotations from both professionals and amateurs for identical songs, as the source for these distinct description types. We also collected a pre-train dataset of over 1.5 million songs; lyrics were included for some, while for others, lyrics were generated using Automatic Speech Recognition (ASR) models.Furthermore, we propose MuDiT/MuSiT, a single-stage framework designed to enhance human-machine alignment in song generation. This framework employs Chinese MuLan (ChinMu) for cross-modal comprehension between natural language descriptions and auditory musical attributes, thereby aligning generated songs with user-defined outcomes. Concurrently, a DiT/SiT model facilitates end-to-end generation of complete songs audio, encompassing both vocals and instrumentation. We proposed metrics to evaluate semantic and auditory discrepancies between generated content and target music. Experimental results demonstrate that MuDiT/MuSiT outperforms baseline models and exhibits superior alignment with both professional and amateur song descriptions.</abstract>
<identifier type="citekey">wang-etal-2025-generative</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.360</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.360/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>6909</start>
<end>6920</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generative Music Models’ Alignment with Professional and Amateur Users’ Expectations
%A Wang, Zihao
%A Yu, Jiaxing
%A Liu, Haoxuan
%A Zheng, Zehui
%A Jin, Yuhang
%A Li, Shuyu
%A Ji, Shulei
%A Zhang, Kejun
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F wang-etal-2025-generative
%X Recent years have witnessed rapid advancements in text-to-music generation using large language models, yielding notable outputs. A critical challenge is understanding users with diverse musical expertise and generating music that meets their expectations, an area that remains underexplored. To address this gap, we introduce the novel task of Professional and Amateur Description-to-Song Generation. This task focuses on aligning generated content with human expressions from varying musical proficiency levels, aiming to produce songs that accurately meet auditory expectations and adhere to musical structural conventions. We utilized the MuChin dataset, which contains annotations from both professionals and amateurs for identical songs, as the source for these distinct description types. We also collected a pre-training dataset of over 1.5 million songs; lyrics were included for some, while for others, lyrics were generated using Automatic Speech Recognition (ASR) models. Furthermore, we propose MuDiT/MuSiT, a single-stage framework designed to enhance human-machine alignment in song generation. This framework employs Chinese MuLan (ChinMu) for cross-modal comprehension between natural language descriptions and auditory musical attributes, thereby aligning generated songs with user-defined outcomes. Concurrently, a DiT/SiT model facilitates end-to-end generation of complete song audio, encompassing both vocals and instrumentation. We propose metrics to evaluate semantic and auditory discrepancies between generated content and target music. Experimental results demonstrate that MuDiT/MuSiT outperforms baseline models and exhibits superior alignment with both professional and amateur song descriptions.
%R 10.18653/v1/2025.findings-acl.360
%U https://aclanthology.org/2025.findings-acl.360/
%U https://doi.org/10.18653/v1/2025.findings-acl.360
%P 6909-6920
Markdown (Informal)
[Generative Music Models’ Alignment with Professional and Amateur Users’ Expectations](https://aclanthology.org/2025.findings-acl.360/) (Wang et al., Findings 2025)
ACL
Zihao Wang, Jiaxing Yu, Haoxuan Liu, Zehui Zheng, Yuhang Jin, Shuyu Li, Shulei Ji, and Kejun Zhang. 2025. Generative Music Models’ Alignment with Professional and Amateur Users’ Expectations. In Findings of the Association for Computational Linguistics: ACL 2025, pages 6909–6920, Vienna, Austria. Association for Computational Linguistics.