% Workshop paper in the NLP4MusA 2024 proceedings; the canonical record is the page in the url field.
% Proper nouns and acronyms in title/booktitle are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{berendes-etal-2024-lyrics,
  title     = {Lyrics Transcription in {Western} Classical Music with {Whisper}: A Case Study on {Schubert}'s {Winterreise}},
  author    = {Berendes, Hans-Ulrich and
               Schw{\"a}r, Simon and
               M{\"u}ller, Meinard},
  editor    = {Kruspe, Anna and
               Oramas, Sergio and
               Epure, Elena V. and
               Sordo, Mohamed and
               Weck, Benno and
               Doh, SeungHeon and
               Won, Minz and
               Manco, Ilaria and
               Meseguer-Brocal, Gabriel},
  booktitle = {Proceedings of the 3rd Workshop on {NLP} for Music and Audio ({NLP4MusA})},
  month     = nov,
  year      = {2024},
  address   = {Oakland, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.nlp4musa-1.3/},
  pages     = {11--16},
  abstract  = {Automatic Lyrics Transcription (ALT) aims to transcribe sung words from music recordings and is closely related to Automatic Speech Recognition (ASR). Although not specifically designed for lyrics transcription, the state-of-the-art ASR model Whisper has recently proven effective for ALT and various related tasks in music information retrieval (MIR). This paper investigates Whisper's performance on Western classical music, using the {\textquotedblleft}Schubert Winterreise Dataset.{\textquotedblright} In particular, we found that the average Word Error Rate (WER) with the unmodified Whisper model is 0.56 for this dataset, while the performance varies greatly across songs and versions. In contrast, spoken versions of the song lyrics, which we recorded, are transcribed with a WER of 0.14. Further systematic experiments with source separation and time-scale modification techniques indicate that Whisper's accuracy in lyrics transcription is less affected by the musical accompaniment and more by the singing style.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="berendes-etal-2024-lyrics">
<titleInfo>
<title>Lyrics Transcription in Western Classical Music with Whisper: A Case Study on Schubert’s Winterreise</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hans-Ulrich</namePart>
<namePart type="family">Berendes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Schwär</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meinard</namePart>
<namePart type="family">Müller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kruspe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="family">Oramas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Epure</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Sordo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benno</namePart>
<namePart type="family">Weck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">SeungHeon</namePart>
<namePart type="family">Doh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minz</namePart>
<namePart type="family">Won</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilaria</namePart>
<namePart type="family">Manco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Meseguer-Brocal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Oakland, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic Lyrics Transcription (ALT) aims to transcribe sung words from music recordings and is closely related to Automatic Speech Recognition (ASR). Although not specifically designed for lyrics transcription, the state-of-the-art ASR model Whisper has recently proven effective for ALT and various related tasks in music information retrieval (MIR). This paper investigates Whisper’s performance on Western classical music, using the “Schubert Winterreise Dataset.” In particular, we found that the average Word Error Rate (WER) with the unmodified Whisper model is 0.56 for this dataset, while the performance varies greatly across songs and versions. In contrast, spoken versions of the song lyrics, which we recorded, are transcribed with a WER of 0.14. Further systematic experiments with source separation and time-scale modification techniques indicate that Whisper’s accuracy in lyrics transcription is less affected by the musical accompaniment and more by the singing style.</abstract>
<identifier type="citekey">berendes-etal-2024-lyrics</identifier>
<location>
<url>https://aclanthology.org/2024.nlp4musa-1.3/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>11</start>
<end>16</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Lyrics Transcription in Western Classical Music with Whisper: A Case Study on Schubert’s Winterreise
%A Berendes, Hans-Ulrich
%A Schwär, Simon
%A Müller, Meinard
%Y Kruspe, Anna
%Y Oramas, Sergio
%Y Epure, Elena V.
%Y Sordo, Mohamed
%Y Weck, Benno
%Y Doh, SeungHeon
%Y Won, Minz
%Y Manco, Ilaria
%Y Meseguer-Brocal, Gabriel
%S Proceedings of the 3rd Workshop on NLP for Music and Audio (NLP4MusA)
%D 2024
%8 November
%I Association for Computational Linguistics
%C Oakland, USA
%F berendes-etal-2024-lyrics
%X Automatic Lyrics Transcription (ALT) aims to transcribe sung words from music recordings and is closely related to Automatic Speech Recognition (ASR). Although not specifically designed for lyrics transcription, the state-of-the-art ASR model Whisper has recently proven effective for ALT and various related tasks in music information retrieval (MIR). This paper investigates Whisper’s performance on Western classical music, using the “Schubert Winterreise Dataset.” In particular, we found that the average Word Error Rate (WER) with the unmodified Whisper model is 0.56 for this dataset, while the performance varies greatly across songs and versions. In contrast, spoken versions of the song lyrics, which we recorded, are transcribed with a WER of 0.14. Further systematic experiments with source separation and time-scale modification techniques indicate that Whisper’s accuracy in lyrics transcription is less affected by the musical accompaniment and more by the singing style.
%U https://aclanthology.org/2024.nlp4musa-1.3/
%P 11-16
Markdown (Informal)
[Lyrics Transcription in Western Classical Music with Whisper: A Case Study on Schubert’s Winterreise](https://aclanthology.org/2024.nlp4musa-1.3/) (Berendes et al., NLP4MusA 2024)
ACL