@inproceedings{esteve-etal-2010-epac,
title = "The {EPAC} Corpus: Manual and Automatic Annotations of Conversational Speech in {F}rench Broadcast News",
author = "Est{\`e}ve, Yannick and
Bazillon, Thierry and
Antoine, Jean-Yves and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Farinas, J{\'e}r{\^o}me",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/650_Paper.pdf",
abstract = "This paper presents the EPAC corpus which is composed by a set of 100 hours of conversational speech manually transcribed and by the outputs of automatic tools (automatic segmentation, transcription, POS tagging, etc.) applied on the entire French ESTER 1 audio corpus: this concerns about 1700 hours of audio recordings from radiophonic shows. This corpus was built during the EPAC project funded by the French Research Agency (ANR) from 2007 to 2010. This corpus increases significantly the amount of French manually transcribed audio recordings easily available and it is now included as a part of the ESTER 1 corpus in the ELRA catalog without additional cost. By providing a large set of automatic outputs of speech processing tools, the EPAC corpus should be useful to researchers who want to work on such data without having to develop and deal with such tools. These automatic annotations are various: segmentation and speaker diarization, one-best hypotheses from the LIUM automatic speech recognition system with confidence measures, but also word-lattices and confusion networks, named entities, part-of-speech tags, chunks, etc. The 100 hours of speech manually transcribed were split into three data sets in order to get an official training corpus, an official development corpus and an official test corpus. These data sets were used to develop and to evaluate some automatic tools which have been used to process the 1700 hours of audio recording. For example, on the EPAC test data set our ASR system yields a word error rate equals to 17.25{\%}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="esteve-etal-2010-epac">
<titleInfo>
<title>The EPAC Corpus: Manual and Automatic Annotations of Conversational Speech in French Broadcast News</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yannick</namePart>
<namePart type="family">Estève</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Bazillon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jean-Yves</namePart>
<namePart type="family">Antoine</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jérôme</namePart>
<namePart type="family">Farinas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents the EPAC corpus which is composed by a set of 100 hours of conversational speech manually transcribed and by the outputs of automatic tools (automatic segmentation, transcription, POS tagging, etc.) applied on the entire French ESTER 1 audio corpus: this concerns about 1700 hours of audio recordings from radiophonic shows. This corpus was built during the EPAC project funded by the French Research Agency (ANR) from 2007 to 2010. This corpus increases significantly the amount of French manually transcribed audio recordings easily available and it is now included as a part of the ESTER 1 corpus in the ELRA catalog without additional cost. By providing a large set of automatic outputs of speech processing tools, the EPAC corpus should be useful to researchers who want to work on such data without having to develop and deal with such tools. These automatic annotations are various: segmentation and speaker diarization, one-best hypotheses from the LIUM automatic speech recognition system with confidence measures, but also word-lattices and confusion networks, named entities, part-of-speech tags, chunks, etc. The 100 hours of speech manually transcribed were split into three data sets in order to get an official training corpus, an official development corpus and an official test corpus. These data sets were used to develop and to evaluate some automatic tools which have been used to process the 1700 hours of audio recording. For example, on the EPAC test data set our ASR system yields a word error rate equals to 17.25%.</abstract>
<identifier type="citekey">esteve-etal-2010-epac</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/650_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The EPAC Corpus: Manual and Automatic Annotations of Conversational Speech in French Broadcast News
%A Estève, Yannick
%A Bazillon, Thierry
%A Antoine, Jean-Yves
%A Béchet, Frédéric
%A Farinas, Jérôme
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F esteve-etal-2010-epac
%X This paper presents the EPAC corpus which is composed by a set of 100 hours of conversational speech manually transcribed and by the outputs of automatic tools (automatic segmentation, transcription, POS tagging, etc.) applied on the entire French ESTER 1 audio corpus: this concerns about 1700 hours of audio recordings from radiophonic shows. This corpus was built during the EPAC project funded by the French Research Agency (ANR) from 2007 to 2010. This corpus increases significantly the amount of French manually transcribed audio recordings easily available and it is now included as a part of the ESTER 1 corpus in the ELRA catalog without additional cost. By providing a large set of automatic outputs of speech processing tools, the EPAC corpus should be useful to researchers who want to work on such data without having to develop and deal with such tools. These automatic annotations are various: segmentation and speaker diarization, one-best hypotheses from the LIUM automatic speech recognition system with confidence measures, but also word-lattices and confusion networks, named entities, part-of-speech tags, chunks, etc. The 100 hours of speech manually transcribed were split into three data sets in order to get an official training corpus, an official development corpus and an official test corpus. These data sets were used to develop and to evaluate some automatic tools which have been used to process the 1700 hours of audio recording. For example, on the EPAC test data set our ASR system yields a word error rate equals to 17.25%.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/650_Paper.pdf
Markdown (Informal)
[The EPAC Corpus: Manual and Automatic Annotations of Conversational Speech in French Broadcast News](http://www.lrec-conf.org/proceedings/lrec2010/pdf/650_Paper.pdf) (Estève et al., LREC 2010)
ACL