@inproceedings{pradhan-etal-2024-science,
title = "My Science Tutor ({M}y{ST}){--}a Large Corpus of Children`s Conversational Speech",
author = "Pradhan, Sameer and
Cole, Ronald A. and
Ward, Wayne H.",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1052/",
pages = "12040--12045",
abstract = "This article describes the [corpus-name] corpus developed as part of the [project-name] project. To the best of our knowledge, this is one of the largest collections of children`s conversational speech that is freely available for non-commercial use under the creative commons license (CC BY-NC-SA 4.0). It comprises approximately 400 hours of speech, spanning some 230K utterances spread across about 10,500 virtual tutor sessions. Roughly 1,300 third, fourth and fifth grade students contributed to this corpus. The current release contains roughly 100K transcribed utterances. It is our hope that the corpus can be used to improve automatic speech recognition models and algorithms. We report the word error rate achieved on the test set using a model trained on the training and development portion of the corpus. The git repository of the corpus contains the complete training and evaluation setup in order to facilitate a fair and consistent evaluation. It is our hope that this corpus will contribute to the creation and evaluation of conversational AI agents having a better understanding of children`s speech, potentially opening doors to novel, effective, learning and therapeutic interventions."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pradhan-etal-2024-science">
<titleInfo>
<title>My Science Tutor (MyST)–a Large Corpus of Children‘s Conversational Speech</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sameer</namePart>
<namePart type="family">Pradhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronald</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Cole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wayne</namePart>
<namePart type="given">H</namePart>
<namePart type="family">Ward</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This article describes the [corpus-name] corpus developed as part of the [project-name] project. To the best of our knowledge, this is one of the largest collections of children‘s conversational speech that is freely available for non-commercial use under the creative commons license (CC BY-NC-SA 4.0). It comprises approximately 400 hours of speech, spanning some 230K utterances spread across about 10,500 virtual tutor sessions. Roughly 1,300 third, fourth and fifth grade students contributed to this corpus. The current release contains roughly 100K transcribed utterances. It is our hope that the corpus can be used to improve automatic speech recognition models and algorithms. We report the word error rate achieved on the test set using a model trained on the training and development portion of the corpus. The git repository of the corpus contains the complete training and evaluation setup in order to facilitate a fair and consistent evaluation. It is our hope that this corpus will contribute to the creation and evaluation of conversational AI agents having a better understanding of children‘s speech, potentially opening doors to novel, effective, learning and therapeutic interventions.</abstract>
<identifier type="citekey">pradhan-etal-2024-science</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1052/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>12040</start>
<end>12045</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T My Science Tutor (MyST)–a Large Corpus of Children‘s Conversational Speech
%A Pradhan, Sameer
%A Cole, Ronald A.
%A Ward, Wayne H.
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F pradhan-etal-2024-science
%X This article describes the [corpus-name] corpus developed as part of the [project-name] project. To the best of our knowledge, this is one of the largest collections of children‘s conversational speech that is freely available for non-commercial use under the creative commons license (CC BY-NC-SA 4.0). It comprises approximately 400 hours of speech, spanning some 230K utterances spread across about 10,500 virtual tutor sessions. Roughly 1,300 third, fourth and fifth grade students contributed to this corpus. The current release contains roughly 100K transcribed utterances. It is our hope that the corpus can be used to improve automatic speech recognition models and algorithms. We report the word error rate achieved on the test set using a model trained on the training and development portion of the corpus. The git repository of the corpus contains the complete training and evaluation setup in order to facilitate a fair and consistent evaluation. It is our hope that this corpus will contribute to the creation and evaluation of conversational AI agents having a better understanding of children‘s speech, potentially opening doors to novel, effective, learning and therapeutic interventions.
%U https://aclanthology.org/2024.lrec-main.1052/
%P 12040-12045
Markdown (Informal)
[My Science Tutor (MyST)–a Large Corpus of Children’s Conversational Speech](https://aclanthology.org/2024.lrec-main.1052/) (Pradhan et al., LREC-COLING 2024)
ACL