% Xu and Koehn (2017): the Zipporah data cleaning system for noisy parallel corpora.
% Conference paper in the EMNLP 2017 proceedings; month uses the standard BibTeX macro.
@InProceedings{xu-koehn:2017:EMNLP2017,
  author    = {Xu, Hainan and Koehn, Philipp},
  title     = {{Zipporah}: a Fast and Scalable Data Cleaning System for Noisy Web-Crawled Parallel Corpora},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2945--2950},
  abstract  = {We introduce Zipporah, a fast and scalable data cleaning system. We propose a
               novel type of bag-of-words translation feature, and train logistic regression
               models to classify good data and synthetic noisy data in the proposed feature
               space. The trained model is used to score parallel sentences in the data pool
               for selection. As shown in experiments, Zipporah selects a high-quality
               parallel corpus from a large, mixed quality data pool. In particular, for one
               noisy dataset, Zipporah achieves a 2.1 BLEU score improvement with using 1/5 of
               the data over using the entire corpus.},
  url       = {https://www.aclweb.org/anthology/D17-1319}
}