@inproceedings{korvas-etal-2014-free,
title = "Free {E}nglish and {C}zech telephone speech corpus shared under the {CC}-{BY}-{SA} 3.0 license",
author = "Korvas, Mat{\v{e}}j and
Pl{\'a}tek, Ond{\v{r}}ej and
Du{\v{s}}ek, Ond{\v{r}}ej and
{\v{Z}}ilka, Luk{\'a}{\v{s}} and
Jur{\v{c}}{\'\i}{\v{c}}ek, Filip",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Loftsson, Hrafn and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)",
month = may,
year = "2014",
address = "Reykjavik, Iceland",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2014/pdf/535_Paper.pdf",
pages = "4423--4428",
abstract = "We present a dataset of telephone conversations in English and Czech, developed for training acoustic models for automatic speech recognition (ASR) in spoken dialogue systems (SDSs). The data comprise 45 hours of speech in English and over 18 hours in Czech. Large part of the data, both audio and transcriptions, was collected using crowdsourcing, the rest are transcriptions by hired transcribers. We release the data together with scripts for data pre-processing and building acoustic models using the HTK and Kaldi ASR toolkits. We publish also the trained models described in this paper. The data are released under the CC-BY-SA 3.0 license, the scripts are licensed under Apache 2.0. In the paper, we report on the methodology of collecting the data, on the size and properties of the data, and on the scripts and their use. We verify the usability of the datasets by training and evaluating acoustic models using the presented data and scripts.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="korvas-etal-2014-free">
<titleInfo>
<title>Free English and Czech telephone speech corpus shared under the CC-BY-SA 3.0 license</title>
</titleInfo>
<name type="personal">
<namePart type="given">Matěj</namePart>
<namePart type="family">Korvas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Plátek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lukáš</namePart>
<namePart type="family">Žilka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Filip</namePart>
<namePart type="family">Jurčíček</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a dataset of telephone conversations in English and Czech, developed for training acoustic models for automatic speech recognition (ASR) in spoken dialogue systems (SDSs). The data comprise 45 hours of speech in English and over 18 hours in Czech. Large part of the data, both audio and transcriptions, was collected using crowdsourcing, the rest are transcriptions by hired transcribers. We release the data together with scripts for data pre-processing and building acoustic models using the HTK and Kaldi ASR toolkits. We publish also the trained models described in this paper. The data are released under the CC-BY-SA 3.0 license, the scripts are licensed under Apache 2.0. In the paper, we report on the methodology of collecting the data, on the size and properties of the data, and on the scripts and their use. We verify the usability of the datasets by training and evaluating acoustic models using the presented data and scripts.</abstract>
<identifier type="citekey">korvas-etal-2014-free</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2014/pdf/535_Paper.pdf</url>
</location>
<part>
<date>2014-05</date>
<extent unit="page">
<start>4423</start>
<end>4428</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Free English and Czech telephone speech corpus shared under the CC-BY-SA 3.0 license
%A Korvas, Matěj
%A Plátek, Ondřej
%A Dušek, Ondřej
%A Žilka, Lukáš
%A Jurčíček, Filip
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Loftsson, Hrafn
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)
%D 2014
%8 May
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F korvas-etal-2014-free
%X We present a dataset of telephone conversations in English and Czech, developed for training acoustic models for automatic speech recognition (ASR) in spoken dialogue systems (SDSs). The data comprise 45 hours of speech in English and over 18 hours in Czech. Large part of the data, both audio and transcriptions, was collected using crowdsourcing, the rest are transcriptions by hired transcribers. We release the data together with scripts for data pre-processing and building acoustic models using the HTK and Kaldi ASR toolkits. We publish also the trained models described in this paper. The data are released under the CC-BY-SA 3.0 license, the scripts are licensed under Apache 2.0. In the paper, we report on the methodology of collecting the data, on the size and properties of the data, and on the scripts and their use. We verify the usability of the datasets by training and evaluating acoustic models using the presented data and scripts.
%U http://www.lrec-conf.org/proceedings/lrec2014/pdf/535_Paper.pdf
%P 4423-4428
Markdown (Informal)
[Free English and Czech telephone speech corpus shared under the CC-BY-SA 3.0 license](http://www.lrec-conf.org/proceedings/lrec2014/pdf/535_Paper.pdf) (Korvas et al., LREC 2014)
ACL