% Citation record for the KT-Speech-Crawler paper (EMNLP 2018 system demonstrations).
% Whole-word brace groups in the title keep the tool name and the product name
% intact under bibliography styles that convert titles to sentence case.
@inproceedings{lakomkin-etal-2018-kt,
    title = "{KT-Speech-Crawler}: Automatic Dataset Construction for Speech Recognition from {YouTube} Videos",
    author = "Lakomkin, Egor and
      Magg, Sven and
      Weber, Cornelius and
      Wermter, Stefan",
    editor = "Blanco, Eduardo and
      Lu, Wei",
    booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
    month = nov,
    year = "2018",
    address = "Brussels, Belgium",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D18-2016",
    doi = "10.18653/v1/D18-2016",
    pages = "90--95",
    abstract = "We describe KT-Speech-Crawler: an approach for automatic dataset construction for speech recognition by crawling YouTube videos. We outline several filtering and post-processing steps, which extract samples that can be used for training end-to-end neural speech recognition systems. In our experiments, we demonstrate that a single-core version of the crawler can obtain around 150 hours of transcribed speech within a day, containing an estimated 3.5{\%} word error rate in the transcriptions. Automatically collected samples contain reading and spontaneous speech recorded in various conditions including background noise and music, distant microphone recordings, and a variety of accents and reverberation. When training a deep neural network on speech recognition, we observed around 40{\%} word error rate reduction on the Wall Street Journal dataset by integrating 200 hours of the collected samples into the training set.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record describing one conference paper. -->
  <mods ID="lakomkin-etal-2018-kt">
    <titleInfo>
      <title>KT-Speech-Crawler: Automatic Dataset Construction for Speech Recognition from YouTube Videos</title>
    </titleInfo>

    <!-- Authors of the paper. -->
    <name type="personal">
      <namePart type="given">Egor</namePart>
      <namePart type="family">Lakomkin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sven</namePart>
      <namePart type="family">Magg</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Cornelius</namePart>
      <namePart type="family">Weber</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Stefan</namePart>
      <namePart type="family">Wermter</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2018-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host publication: the proceedings volume with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Eduardo</namePart>
        <namePart type="family">Blanco</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wei</namePart>
        <namePart type="family">Lu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Brussels, Belgium</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>We describe KT-Speech-Crawler: an approach for automatic dataset construction for speech recognition by crawling YouTube videos. We outline several filtering and post-processing steps, which extract samples that can be used for training end-to-end neural speech recognition systems. In our experiments, we demonstrate that a single-core version of the crawler can obtain around 150 hours of transcribed speech within a day, containing an estimated 3.5% word error rate in the transcriptions. Automatically collected samples contain reading and spontaneous speech recorded in various conditions including background noise and music, distant microphone recordings, and a variety of accents and reverberation. When training a deep neural network on speech recognition, we observed around 40% word error rate reduction on the Wall Street Journal dataset by integrating 200 hours of the collected samples into the training set.</abstract>

    <!-- Identifiers and the landing-page URL for the paper. -->
    <identifier type="citekey">lakomkin-etal-2018-kt</identifier>
    <identifier type="doi">10.18653/v1/D18-2016</identifier>
    <location>
      <url>https://aclanthology.org/D18-2016</url>
    </location>

    <!-- Issue date and page extent of the paper within the host volume. -->
    <part>
      <date>2018-11</date>
      <extent unit="page">
        <start>90</start>
        <end>95</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T KT-Speech-Crawler: Automatic Dataset Construction for Speech Recognition from YouTube Videos
%A Lakomkin, Egor
%A Magg, Sven
%A Weber, Cornelius
%A Wermter, Stefan
%Y Blanco, Eduardo
%Y Lu, Wei
%S Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2018
%8 November
%I Association for Computational Linguistics
%C Brussels, Belgium
%F lakomkin-etal-2018-kt
%X We describe KT-Speech-Crawler: an approach for automatic dataset construction for speech recognition by crawling YouTube videos. We outline several filtering and post-processing steps, which extract samples that can be used for training end-to-end neural speech recognition systems. In our experiments, we demonstrate that a single-core version of the crawler can obtain around 150 hours of transcribed speech within a day, containing an estimated 3.5% word error rate in the transcriptions. Automatically collected samples contain reading and spontaneous speech recorded in various conditions including background noise and music, distant microphone recordings, and a variety of accents and reverberation. When training a deep neural network on speech recognition, we observed around 40% word error rate reduction on the Wall Street Journal dataset by integrating 200 hours of the collected samples into the training set.
%R 10.18653/v1/D18-2016
%U https://aclanthology.org/D18-2016
%U https://doi.org/10.18653/v1/D18-2016
%P 90-95
Markdown (Informal)
[KT-Speech-Crawler: Automatic Dataset Construction for Speech Recognition from YouTube Videos](https://aclanthology.org/D18-2016) (Lakomkin et al., EMNLP 2018)
ACL