% ACL Anthology record 2026.eacl-srw.28: thesis proposal published in the
% EACL 2026 Student Research Workshop proceedings (Volume 4).
% Proper nouns in booktitle are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitals.
@inproceedings{sourada-2026-thesis,
  title     = {Thesis Proposal: Multimodal Benchmark for Music Understanding in Large Language Models},
  author    = {Sourada, Tom{\'a}{\v{s}}},
  editor    = {Baez Santamaria, Selene and
               Somayajula, Sai Ashish and
               Yamaguchi, Atsuki},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 4: Student Research Workshop)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.eacl-srw.28/},
  pages     = {393--405},
  isbn      = {979-8-89176-383-8},
  abstract  = {Music is a universal cultural practice that influences emotion, ritual and creativity, and it is now represented in many digital modalities: audio recordings, symbolic encodings (MIDI, MusicXML, ABC), visual scores and lyrics. Multimodal Large Language Models (MLLMs) have the ambition to process ``everything'', including music, and therefore promise to support musical analysis, creation and education. Despite this promise, systematic methods for evaluating whether a MLLM understands music are missing. Existing music-focused benchmarks are fragmented, largely single-modality, Western-centric, and often do not require actual perception of the musical content; methodological details such as prompt design and answer-extraction are frequently omitted or not discussed, and some evaluations rely on proprietary LLMs, hindering reproducibility and raising concerns about test-data leakage. To fill this gap, this dissertation proposes to design a musically multimodal benchmark built on a transparent, fully open evaluation pipeline. The benchmark will present closed-question-answer items across four musical modalities, employ carefully engineered distractor options to enforce genuine perceptual engagement, and follow rigorously documented prompt-selection and answer-extraction procedures. It will further incorporate culturally diverse musical material beyond the dominant Western canon. Guided by three research questions: (1) how to devise robust, reproducible evaluation procedures, (2) how current MLLMs perform across modalities, and (3) how model scores relate to human musical abilities; the benchmark will enable precise diagnosis of model limitations, inform the development of more musically aware AI systems, and provide a principled basis for assessing practical usefulness to musicians and other stakeholders in the creative industry.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 rendering of the citation with citekey sourada-2026-thesis. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sourada-2026-thesis">
    <!-- The paper itself: title, its single listed author, and issue date. -->
    <titleInfo>
      <title>Thesis Proposal: Multimodal Benchmark for Music Understanding in Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Tomáš</namePart>
      <namePart type="family">Sourada</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its three editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Selene</namePart>
        <namePart type="family">Baez Santamaria</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sai</namePart>
        <namePart type="given">Ashish</namePart>
        <namePart type="family">Somayajula</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Atsuki</namePart>
        <namePart type="family">Yamaguchi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-383-8</identifier>
    </relatedItem>
    <!-- Abstract text is kept verbatim. -->
    <abstract>Music is a universal cultural practice that influences emotion, ritual and creativity, and it is now represented in many digital modalities: audio recordings, symbolic encodings (MIDI, MusicXML, ABC), visual scores and lyrics. Multimodal Large Language Models (MLLMs) have the ambition to process “everything”, including music, and therefore promise to support musical analysis, creation and education. Despite this promise, systematic methods for evaluating whether a MLLM understands music are missing. Existing music-focused benchmarks are fragmented, largely single-modality, Western-centric, and often do not require actual perception of the musical content; methodological details such as prompt design and answer-extraction are frequently omitted or not discussed, and some evaluations rely on proprietary LLMs, hindering reproducibility and raising concerns about test-data leakage. To fill this gap, this dissertation proposes to design a musically multimodal benchmark built on a transparent, fully open evaluation pipeline. The benchmark will present closed-question-answer items across four musical modalities, employ carefully engineered distractor options to enforce genuine perceptual engagement, and follow rigorously documented prompt-selection and answer-extraction procedures. It will further incorporate culturally diverse musical material beyond the dominant Western canon. Guided by three research questions: (1) how to devise robust, reproducible evaluation procedures, (2) how current MLLMs perform across modalities, and (3) how model scores relate to human musical abilities; the benchmark will enable precise diagnosis of model limitations, inform the development of more musically aware AI systems, and provide a principled basis for assessing practical usefulness to musicians and other stakeholders in the creative industry.</abstract>
    <identifier type="citekey">sourada-2026-thesis</identifier>
    <location>
      <url>https://aclanthology.org/2026.eacl-srw.28/</url>
    </location>
    <!-- Position of the paper inside the host volume: date and page extent. -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>393</start>
        <end>405</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Thesis Proposal: Multimodal Benchmark for Music Understanding in Large Language Models
%A Sourada, Tomáš
%Y Baez Santamaria, Selene
%Y Somayajula, Sai Ashish
%Y Yamaguchi, Atsuki
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-383-8
%F sourada-2026-thesis
%X Music is a universal cultural practice that influences emotion, ritual and creativity, and it is now represented in many digital modalities: audio recordings, symbolic encodings (MIDI, MusicXML, ABC), visual scores and lyrics. Multimodal Large Language Models (MLLMs) have the ambition to process “everything”, including music, and therefore promise to support musical analysis, creation and education. Despite this promise, systematic methods for evaluating whether a MLLM understands music are missing. Existing music-focused benchmarks are fragmented, largely single-modality, Western-centric, and often do not require actual perception of the musical content; methodological details such as prompt design and answer-extraction are frequently omitted or not discussed, and some evaluations rely on proprietary LLMs, hindering reproducibility and raising concerns about test-data leakage. To fill this gap, this dissertation proposes to design a musically multimodal benchmark built on a transparent, fully open evaluation pipeline. The benchmark will present closed-question-answer items across four musical modalities, employ carefully engineered distractor options to enforce genuine perceptual engagement, and follow rigorously documented prompt-selection and answer-extraction procedures. It will further incorporate culturally diverse musical material beyond the dominant Western canon. Guided by three research questions: (1) how to devise robust, reproducible evaluation procedures, (2) how current MLLMs perform across modalities, and (3) how model scores relate to human musical abilities; the benchmark will enable precise diagnosis of model limitations, inform the development of more musically aware AI systems, and provide a principled basis for assessing practical usefulness to musicians and other stakeholders in the creative industry.
%U https://aclanthology.org/2026.eacl-srw.28/
%P 393-405
Markdown (Informal)
[Thesis Proposal: Multimodal Benchmark for Music Understanding in Large Language Models](https://aclanthology.org/2026.eacl-srw.28/) (Sourada, EACL 2026)
ACL