% Conference paper, ACL Anthology ID I17-1101 (see url). The acronym in the title is brace-protected against style recasing; month uses the standard macro.
@InProceedings{dhondt-grouin-grau:2017:I17-1,
  author    = {D'hondt, Eva and Grouin, Cyril and Grau, Brigitte},
  title     = {Generating a Training Corpus for {OCR} Post-Correction Using Encoder-Decoder Model},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  month     = nov,
  year      = {2017},
  address   = {Taipei, Taiwan},
  publisher = {Asian Federation of Natural Language Processing},
  pages     = {1006--1014},
  abstract  = {In this paper we present a novel approach to the automatic correction of
	OCR-induced orthographic errors in a given text. While current systems depend
	heavily on large training corpora or external information, such as
	domain-specific lexicons or confidence scores from the OCR process, our system
	only requires a small amount of (relatively) clean training data from a
	representative corpus to learn a character-based statistical language model
	using Bidirectional Long Short-Term Memory Networks (biLSTMs). We demonstrate
	the versatility and adaptability of our system on different text corpora with
	varying degrees of textual noise, including a real-life OCR corpus in the
	medical domain.},
  url       = {http://www.aclweb.org/anthology/I17-1101}
}

