@comment{
  Conference paper in the LREC-COLING 2024 proceedings.
  Proper nouns in the title and the venue acronym in the booktitle are
  brace-protected so that sentence-casing bibliography styles keep their
  capitalisation. Accented letters use the brace-wrapped escape form so
  classic BibTeX treats each one as a single letter.
}
@inproceedings{duriskova-etal-2024-khan,
  title     = {{Khan Academy Corpus}: A Multilingual Corpus of {Khan Academy} Lectures},
  author    = {{\v{D}}uri{\v{s}}kov{\'a}, Dominika and
               Jur{\'a}{\v{s}}ov{\'a}, Daniela and
               {\v{Z}}ilinec, Mat{\'u}{\v{s}} and
               {\v{S}}ubert, Eduard and
               Bojar, Ond{\v{r}}ej},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.851},
  pages     = {9743--9752},
  abstract  = {We present the Khan Academy Corpus totalling 10122 hours in 87394 recordings across 29 languages, where 43{\%} of recordings (4252 hours) are equipped with human-written subtitles. The subtitle texts cover a total of 137 languages. The dataset was collected from open access Khan Academy lectures, benefiting from their manual transcripts and manual translations of the transcripts. The dataset can serve in creation or evaluation of multilingual speech recognition or translation systems, featuring a diverse set of subject domains.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for the citation with ID/citekey duriskova-etal-2024-khan. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="duriskova-etal-2024-khan">
    <titleInfo>
      <title>Khan Academy Corpus: A Multilingual Corpus of Khan Academy Lectures</title>
    </titleInfo>
    <!-- Authors of the paper, in citation order. -->
    <name type="personal">
      <namePart type="given">Dominika</namePart>
      <namePart type="family">Ďurišková</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daniela</namePart>
      <namePart type="family">Jurášová</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Matúš</namePart>
      <namePart type="family">Žilinec</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eduard</namePart>
      <namePart type="family">Šubert</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ondřej</namePart>
      <namePart type="family">Bojar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher details. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Min-Yen</namePart>
        <namePart type="family">Kan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Veronique</namePart>
        <namePart type="family">Hoste</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alessandro</namePart>
        <namePart type="family">Lenci</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sakriani</namePart>
        <namePart type="family">Sakti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nianwen</namePart>
        <namePart type="family">Xue</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We present the Khan Academy Corpus totalling 10122 hours in 87394 recordings across 29 languages, where 43% of recordings (4252 hours) are equipped with human-written subtitles. The subtitle texts cover a total of 137 languages. The dataset was collected from open access Khan Academy lectures, benefiting from their manual transcripts and manual translations of the transcripts. The dataset can serve in creation or evaluation of multilingual speech recognition or translation systems, featuring a diverse set of subject domains.</abstract>
    <identifier type="citekey">duriskova-etal-2024-khan</identifier>
    <location>
      <url>https://aclanthology.org/2024.lrec-main.851</url>
    </location>
    <!-- Issue date and page range of the paper. -->
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>9743</start>
        <end>9752</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Khan Academy Corpus: A Multilingual Corpus of Khan Academy Lectures
%A Ďurišková, Dominika
%A Jurášová, Daniela
%A Žilinec, Matúš
%A Šubert, Eduard
%A Bojar, Ondřej
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F duriskova-etal-2024-khan
%X We present the Khan Academy Corpus totalling 10122 hours in 87394 recordings across 29 languages, where 43% of recordings (4252 hours) are equipped with human-written subtitles. The subtitle texts cover a total of 137 languages. The dataset was collected from open access Khan Academy lectures, benefiting from their manual transcripts and manual translations of the transcripts. The dataset can serve in creation or evaluation of multilingual speech recognition or translation systems, featuring a diverse set of subject domains.
%U https://aclanthology.org/2024.lrec-main.851
%P 9743-9752
Markdown (Informal)
[Khan Academy Corpus: A Multilingual Corpus of Khan Academy Lectures](https://aclanthology.org/2024.lrec-main.851) (Ďurišková et al., LREC-COLING 2024)
ACL
- Dominika Ďurišková, Daniela Jurášová, Matúš Žilinec, Eduard Šubert, and Ondřej Bojar. 2024. Khan Academy Corpus: A Multilingual Corpus of Khan Academy Lectures. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 9743–9752, Torino, Italia. ELRA and ICCL.