@inproceedings{au-2026-midi,
title = "{MIDI}-{PHOR}: Multi-View Distillation for Music Understanding and Captioning",
author = "Au, Steven",
editor = "Epure, Elena V. and
Oramas, Sergio and
Doh, SeungHeon and
Ramoneda, Pedro and
Kruspe, Anna and
Sordo, Mohamed",
booktitle = "Proceedings of the 4th Workshop on {NLP} for Music and Audio ({NLP}4{M}us{A} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.nlp4musa-1.6/",
doi = "10.18653/v1/2026.nlp4musa-1.6",
pages = "33--43",
ISBN = "979-8-89176-369-2",
abstract = "A central limitation of current music understanding frameworks is the reliance on audio embeddings, which frequently yields interpretations lacking traceable ties to explicit musical elements such as notes, dynamics, and instrumentation. We address this gap with MIDI-PHOR, a MIDI-first framework that converts symbolic data into structured, queryable representations for reasoning. MIDI-PHOR distills each piece into three complementary views: a symbolic view capturing pitch, meter, and key; a time-series (TS) view that tracks rhythmic salience, texture, and role activity; and an instrument-role graph encoding ensemble interactions. With evidence-linked claims, experiments demonstrate reduced hallucinations compared to raw-MIDI baselines and offer a robust, auditable bridge between symbolic data and semantic music understanding."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="au-2026-midi">
<titleInfo>
<title>MIDI-PHOR: Multi-View Distillation for Music Understanding and Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Au</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on NLP for Music and Audio (NLP4MusA 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Epure</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="family">Oramas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">SeungHeon</namePart>
<namePart type="family">Doh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="family">Ramoneda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kruspe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Sordo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-369-2</identifier>
</relatedItem>
<abstract>A central limitation of current music understanding frameworks is the reliance on audio embeddings, which frequently yields interpretations lacking traceable ties to explicit musical elements such as notes, dynamics, and instrumentation. We address this gap with MIDI-PHOR, a MIDI-first framework that converts symbolic data into structured, queryable representations for reasoning. MIDI-PHOR distills each piece into three complementary views: a symbolic view capturing pitch, meter, and key; a time-series (TS) view that tracks rhythmic salience, texture, and role activity; and an instrument-role graph encoding ensemble interactions. With evidence-linked claims, experiments demonstrate reduced hallucinations compared to raw-MIDI baselines and offer a robust, auditable bridge between symbolic data and semantic music understanding.</abstract>
<identifier type="citekey">au-2026-midi</identifier>
<identifier type="doi">10.18653/v1/2026.nlp4musa-1.6</identifier>
<location>
<url>https://aclanthology.org/2026.nlp4musa-1.6/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>33</start>
<end>43</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MIDI-PHOR: Multi-View Distillation for Music Understanding and Captioning
%A Au, Steven
%Y Epure, Elena V.
%Y Oramas, Sergio
%Y Doh, SeungHeon
%Y Ramoneda, Pedro
%Y Kruspe, Anna
%Y Sordo, Mohamed
%S Proceedings of the 4th Workshop on NLP for Music and Audio (NLP4MusA 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-369-2
%F au-2026-midi
%X A central limitation of current music understanding frameworks is the reliance on audio embeddings, which frequently yields interpretations lacking traceable ties to explicit musical elements such as notes, dynamics, and instrumentation. We address this gap with MIDI-PHOR, a MIDI-first framework that converts symbolic data into structured, queryable representations for reasoning. MIDI-PHOR distills each piece into three complementary views: a symbolic view capturing pitch, meter, and key; a time-series (TS) view that tracks rhythmic salience, texture, and role activity; and an instrument-role graph encoding ensemble interactions. With evidence-linked claims, experiments demonstrate reduced hallucinations compared to raw-MIDI baselines and offer a robust, auditable bridge between symbolic data and semantic music understanding.
%R 10.18653/v1/2026.nlp4musa-1.6
%U https://aclanthology.org/2026.nlp4musa-1.6/
%U https://doi.org/10.18653/v1/2026.nlp4musa-1.6
%P 33-43
Markdown (Informal)
[MIDI-PHOR: Multi-View Distillation for Music Understanding and Captioning](https://aclanthology.org/2026.nlp4musa-1.6/) (Au, NLP4MusA 2026)
ACL