% Conference paper record (ACL Anthology, 2025.nodalida-1.49).
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
% The `address` field holds the place given in the source record (Tallinn, Estonia).
@inproceedings{parsons-etal-2025-adding,
  title     = {Adding Metadata to Existing Parliamentary Speech Corpus},
  author    = {Parsons, Phoebe and
               Solberg, Per Erik and
               Kvale, Knut and
               Svendsen, Torbj{\o}rn and
               Salvi, Giampiero},
  editor    = {Johansson, Richard and
               Stymne, Sara},
  booktitle = {Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)},
  month     = mar,
  year      = {2025},
  address   = {Tallinn, Estonia},
  publisher = {University of Tartu Library},
  url       = {https://aclanthology.org/2025.nodalida-1.49/},
  pages     = {448--457},
  isbn      = {978-9908-53-109-0},
  abstract  = {Parliamentary proceedings are convenient data sources for creating corpora for speech technology. Given its public nature, there is an abundance of extra information about the speakers that can be legally and ethically harvested to enrich this kind of corpora. This paper describes the methods we have used to add speaker metadata to the Stortinget Speech Corpus (SSC) containing over 5,000 hours of Norwegian speech with non-verbatim transcripts but without speaker metadata. The additional metadata for each speech segment includes speaker ID, gender, date of birth, municipality of birth, and counties represented. We also infer speaker dialect from their municipality of birth using a manually designed mapping between municipalities and Norwegian dialects. We provide observations on the SSC data and give suggestions for how it may be used for tasks other than speech recognition. Finally, we demonstrate the utility of this new metadata through a dialect identification task. The described methods can be adapted to add metadata information to parliamentary corpora in other languages.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="parsons-etal-2025-adding">
    <titleInfo>
      <title>Adding Metadata to Existing Parliamentary Speech Corpus</title>
    </titleInfo>

    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Phoebe</namePart>
      <namePart type="family">Parsons</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Per</namePart>
      <namePart type="given">Erik</namePart>
      <namePart type="family">Solberg</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Knut</namePart>
      <namePart type="family">Kvale</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Torbjørn</namePart>
      <namePart type="family">Svendsen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Giampiero</namePart>
      <namePart type="family">Salvi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Richard</namePart>
        <namePart type="family">Johansson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sara</namePart>
        <namePart type="family">Stymne</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>University of Tartu Library</publisher>
        <place>
          <placeTerm type="text">Tallinn, Estonia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">978-9908-53-109-0</identifier>
    </relatedItem>

    <abstract>Parliamentary proceedings are convenient data sources for creating corpora for speech technology. Given its public nature, there is an abundance of extra information about the speakers that can be legally and ethically harvested to enrich this kind of corpora. This paper describes the methods we have used to add speaker metadata to the Stortinget Speech Corpus (SSC) containing over 5,000 hours of Norwegian speech with non-verbatim transcripts but without speaker metadata. The additional metadata for each speech segment includes speaker ID, gender, date of birth, municipality of birth, and counties represented. We also infer speaker dialect from their municipality of birth using a manually designed mapping between municipalities and Norwegian dialects. We provide observations on the SSC data and give suggestions for how it may be used for tasks other than speech recognition. Finally, we demonstrate the utility of this new metadata through a dialect identification task. The described methods can be adapted to add metadata information to parliamentary corpora in other languages.</abstract>
    <identifier type="citekey">parsons-etal-2025-adding</identifier>
    <location>
      <url>https://aclanthology.org/2025.nodalida-1.49/</url>
    </location>

    <!-- Position of the paper inside the host volume. -->
    <part>
      <date>2025-03</date>
      <extent unit="page">
        <start>448</start>
        <end>457</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Adding Metadata to Existing Parliamentary Speech Corpus
%A Parsons, Phoebe
%A Solberg, Per Erik
%A Kvale, Knut
%A Svendsen, Torbjørn
%A Salvi, Giampiero
%Y Johansson, Richard
%Y Stymne, Sara
%S Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-109-0
%F parsons-etal-2025-adding
%X Parliamentary proceedings are convenient data sources for creating corpora for speech technology. Given its public nature, there is an abundance of extra information about the speakers that can be legally and ethically harvested to enrich this kind of corpora. This paper describes the methods we have used to add speaker metadata to the Stortinget Speech Corpus (SSC) containing over 5,000 hours of Norwegian speech with non-verbatim transcripts but without speaker metadata. The additional metadata for each speech segment includes speaker ID, gender, date of birth, municipality of birth, and counties represented. We also infer speaker dialect from their municipality of birth using a manually designed mapping between municipalities and Norwegian dialects. We provide observations on the SSC data and give suggestions for how it may be used for tasks other than speech recognition. Finally, we demonstrate the utility of this new metadata through a dialect identification task. The described methods can be adapted to add metadata information to parliamentary corpora in other languages.
%U https://aclanthology.org/2025.nodalida-1.49/
%P 448-457
Markdown (Informal)
[Adding Metadata to Existing Parliamentary Speech Corpus](https://aclanthology.org/2025.nodalida-1.49/) (Parsons et al., NoDaLiDa 2025)
ACL
- Phoebe Parsons, Per Erik Solberg, Knut Kvale, Torbjørn Svendsen, and Giampiero Salvi. 2025. Adding Metadata to Existing Parliamentary Speech Corpus. In Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), pages 448–457, Tallinn, Estonia. University of Tartu Library.