@inproceedings{kulebi-etal-2022-parlamentparla,
title = "{P}arlament{P}arla: A Speech Corpus of {C}atalan Parliamentary Sessions",
author = "Kulebi, Baybars and
Armentano-Oller, Carme and
Rodriguez-Penagos, Carlos and
Villegas, Marta",
booktitle = "Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.parlaclarin-1.18",
pages = "125--130",
abstract = "Recently, various end-to-end architectures of Automatic Speech Recognition (ASR) are being showcased as an important step towards providing language technologies to all languages instead of a select few such as English. However many languages are still suffering due to the {``}digital gap,{''} lacking thousands of hours of transcribed speech data openly accessible that is necessary to train modern ASR architectures. Although Catalan already has access to various open speech corpora, these corpora lack diversity and are limited in total volume. In order to address this lack of resources for Catalan language, in this work we present ParlamentParla, a corpus of more than 600 hours of speech from Catalan Parliament sessions. This corpus has already been used in training of state-of-the-art ASR systems, and proof-of-concept text-to-speech (TTS) models. In this work we explain in detail the pipeline that allows the information publicly available on the parliamentary website to be converted to a speech corpus compatible with training of ASR and possibly TTS models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kulebi-etal-2022-parlamentparla">
<titleInfo>
<title>ParlamentParla: A Speech Corpus of Catalan Parliamentary Sessions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Baybars</namePart>
<namePart type="family">Kulebi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carme</namePart>
<namePart type="family">Armentano-Oller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carlos</namePart>
<namePart type="family">Rodriguez-Penagos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="family">Villegas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recently, various end-to-end architectures of Automatic Speech Recognition (ASR) are being showcased as an important step towards providing language technologies to all languages instead of a select few such as English. However many languages are still suffering due to the “digital gap,” lacking thousands of hours of transcribed speech data openly accessible that is necessary to train modern ASR architectures. Although Catalan already has access to various open speech corpora, these corpora lack diversity and are limited in total volume. In order to address this lack of resources for Catalan language, in this work we present ParlamentParla, a corpus of more than 600 hours of speech from Catalan Parliament sessions. This corpus has already been used in training of state-of-the-art ASR systems, and proof-of-concept text-to-speech (TTS) models. In this work we explain in detail the pipeline that allows the information publicly available on the parliamentary website to be converted to a speech corpus compatible with training of ASR and possibly TTS models.</abstract>
<identifier type="citekey">kulebi-etal-2022-parlamentparla</identifier>
<location>
<url>https://aclanthology.org/2022.parlaclarin-1.18</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>125</start>
<end>130</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ParlamentParla: A Speech Corpus of Catalan Parliamentary Sessions
%A Kulebi, Baybars
%A Armentano-Oller, Carme
%A Rodriguez-Penagos, Carlos
%A Villegas, Marta
%S Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F kulebi-etal-2022-parlamentparla
%X Recently, various end-to-end architectures of Automatic Speech Recognition (ASR) are being showcased as an important step towards providing language technologies to all languages instead of a select few such as English. However many languages are still suffering due to the “digital gap,” lacking thousands of hours of transcribed speech data openly accessible that is necessary to train modern ASR architectures. Although Catalan already has access to various open speech corpora, these corpora lack diversity and are limited in total volume. In order to address this lack of resources for Catalan language, in this work we present ParlamentParla, a corpus of more than 600 hours of speech from Catalan Parliament sessions. This corpus has already been used in training of state-of-the-art ASR systems, and proof-of-concept text-to-speech (TTS) models. In this work we explain in detail the pipeline that allows the information publicly available on the parliamentary website to be converted to a speech corpus compatible with training of ASR and possibly TTS models.
%U https://aclanthology.org/2022.parlaclarin-1.18
%P 125-130
Markdown (Informal)
[ParlamentParla: A Speech Corpus of Catalan Parliamentary Sessions](https://aclanthology.org/2022.parlaclarin-1.18) (Kulebi et al., ParlaCLARIN 2022)
ACL
- Baybars Kulebi, Carme Armentano-Oller, Carlos Rodriguez-Penagos, and Marta Villegas. 2022. ParlamentParla: A Speech Corpus of Catalan Parliamentary Sessions. In Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference, pages 125–130, Marseille, France. European Language Resources Association.