@inproceedings{solberg-etal-2023-large,
title = "A Large {N}orwegian Dataset for Weak Supervision {ASR}",
author = "Solberg, Per Erik and
Beauguitte, Pierre and
Kummervold, Per Egil and
Wetjen, Freddy",
editor = "Ilinykh, Nikolai and
Morger, Felix and
Dann{\'e}lls, Dana and
Dobnik, Simon and
Megyesi, Be{\'a}ta and
Nivre, Joakim",
booktitle = "Proceedings of the Second Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2023)",
month = may,
year = "2023",
address = "T{\'o}rshavn, the Faroe Islands",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.resourceful-1.7",
pages = "48--52",
abstract = "With the advent of weakly supervised ASR systems like Whisper, it is possible to train ASR systems on non-verbatim transcriptions. This paper describes an effort to create a large Norwegian dataset for weakly supervised ASR from parliamentary recordings. Audio from Stortinget, the Norwegian parliament, is segmented and transcribed with an existing ASR system. An algorithm retrieves transcripts of these segments from Stortinget{'}s official proceedings using the Levenshtein edit distance between the ASR output and the proceedings text. In that way, a dataset of more than 5000 hours of transcribed speech is produced with limited human effort. Since parliamentary data is public domain, the dataset can be shared freely without any restrictions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="solberg-etal-2023-large">
<titleInfo>
<title>A Large Norwegian Dataset for Weak Supervision ASR</title>
</titleInfo>
<name type="personal">
<namePart type="given">Per</namePart>
<namePart type="given">Erik</namePart>
<namePart type="family">Solberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Beauguitte</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Per</namePart>
<namePart type="given">Egil</namePart>
<namePart type="family">Kummervold</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Freddy</namePart>
<namePart type="family">Wetjen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolai</namePart>
<namePart type="family">Ilinykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Morger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dana</namePart>
<namePart type="family">Dannélls</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Beáta</namePart>
<namePart type="family">Megyesi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Nivre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Tórshavn, the Faroe Islands</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the advent of weakly supervised ASR systems like Whisper, it is possible to train ASR systems on non-verbatim transcriptions. This paper describes an effort to create a large Norwegian dataset for weakly supervised ASR from parliamentary recordings. Audio from Stortinget, the Norwegian parliament, is segmented and transcribed with an existing ASR system. An algorithm retrieves transcripts of these segments from Stortinget’s official proceedings using the Levenshtein edit distance between the ASR output and the proceedings text. In that way, a dataset of more than 5000 hours of transcribed speech is produced with limited human effort. Since parliamentary data is public domain, the dataset can be shared freely without any restrictions.</abstract>
<identifier type="citekey">solberg-etal-2023-large</identifier>
<location>
<url>https://aclanthology.org/2023.resourceful-1.7</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>48</start>
<end>52</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Large Norwegian Dataset for Weak Supervision ASR
%A Solberg, Per Erik
%A Beauguitte, Pierre
%A Kummervold, Per Egil
%A Wetjen, Freddy
%Y Ilinykh, Nikolai
%Y Morger, Felix
%Y Dannélls, Dana
%Y Dobnik, Simon
%Y Megyesi, Beáta
%Y Nivre, Joakim
%S Proceedings of the Second Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2023)
%D 2023
%8 May
%I Association for Computational Linguistics
%C Tórshavn, the Faroe Islands
%F solberg-etal-2023-large
%X With the advent of weakly supervised ASR systems like Whisper, it is possible to train ASR systems on non-verbatim transcriptions. This paper describes an effort to create a large Norwegian dataset for weakly supervised ASR from parliamentary recordings. Audio from Stortinget, the Norwegian parliament, is segmented and transcribed with an existing ASR system. An algorithm retrieves transcripts of these segments from Stortinget’s official proceedings using the Levenshtein edit distance between the ASR output and the proceedings text. In that way, a dataset of more than 5000 hours of transcribed speech is produced with limited human effort. Since parliamentary data is public domain, the dataset can be shared freely without any restrictions.
%U https://aclanthology.org/2023.resourceful-1.7
%P 48-52
Markdown (Informal)
[A Large Norwegian Dataset for Weak Supervision ASR](https://aclanthology.org/2023.resourceful-1.7) (Solberg et al., RESOURCEFUL 2023)
ACL
- Per Erik Solberg, Pierre Beauguitte, Per Egil Kummervold, and Freddy Wetjen. 2023. A Large Norwegian Dataset for Weak Supervision ASR. In Proceedings of the Second Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2023), pages 48–52, Tórshavn, the Faroe Islands. Association for Computational Linguistics.