% Conference paper from the EMNLP 2018 System Demonstrations proceedings.
% Case-sensitive words in the title are brace-protected, the month uses the
% predefined macro, and percent signs in the abstract are escaped so the
% text is safe if a style prints it.
@InProceedings{lakomkin-EtAl:2018:Demos,
  author    = {Lakomkin, Egor and Magg, Sven and Weber, Cornelius and Wermter, Stefan},
  title     = {{KT-Speech-Crawler}: Automatic Dataset Construction for Speech Recognition from {YouTube} Videos},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  month     = nov,
  year      = {2018},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  pages     = {90--95},
  abstract  = {We describe KT-Speech-Crawler: an approach for automatic dataset construction for speech recognition by crawling YouTube videos. We outline several filtering and post-processing steps, which extract samples that can be used for training end-to-end neural speech recognition systems. In our experiments, we demonstrate that a single-core version of the crawler can obtain around 150 hours of transcribed speech within a day, containing an estimated 3.5\% word error rate in the transcriptions. Automatically collected samples contain reading and spontaneous speech recorded in various conditions including background noise and music, distant microphone recordings, and a variety of accents and reverberation. When training a deep neural network on speech recognition, we observed around 40\% word error rate reduction on the Wall Street Journal dataset by integrating 200 hours of the collected samples into the training set.},
  doi       = {10.18653/v1/D18-2016},
  url       = {http://www.aclweb.org/anthology/D18-2016}
}