% Workshop paper (ACL Anthology ID W17-1317, per the url field) describing the
% KALAM'DZ speech corpus. Braces in title/booktitle protect proper nouns from
% style down-casing; month uses the predefined macro so styles can format it.
@inproceedings{bougrine-EtAl:2017:W17-13,
  author    = {Bougrine, Soumia and Chorana, Aicha and Lakhdari, Abdallah and Cherroun, Hadda},
  title     = {Toward a {Web}-based Speech Corpus for {Algerian} Dialectal {Arabic} Varieties},
  booktitle = {Proceedings of the Third {Arabic} Natural Language Processing Workshop},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {138--146},
  abstract  = {The success of machine learning for automatic speech
               processing has raised the need for large scale datasets.
               However, collecting such data is often a challenging task as
               it implies significant investment involving time and money
               cost. In this paper, we devise a recipe for building
               large-scale Speech Corpora by harnessing Web resources namely
               YouTube, other Social Media, Online Radio and TV. We
               illustrate our methodology by building KALAM'DZ, An Arabic
               Spoken corpus dedicated to Algerian dialectal varieties. The
               preliminary version of our dataset covers all major Algerian
               dialects. In addition, we make sure that this material takes
               into account numerous aspects that foster its richness. In
               fact, we have targeted various speech topics. Some automatic
               and manual annotations are provided. They gather useful
               information related to the speakers and sub-dialect
               information at the utterance level. Our corpus encompasses
               the 8 major Algerian Arabic sub-dialects with 4881 speakers
               and more than 104.4 hours segmented in utterances of at least
               6 s.},
  url       = {http://www.aclweb.org/anthology/W17-1317},
}

