% Schulz and Kuhn, EMNLP 2017 conference paper on OCR post-correction.
% The month uses the predefined BibTeX macro so the style controls its rendering;
% the acronym in the title is braced so sentence-casing styles keep it upper-case.
@InProceedings{schulz-kuhn:2017:EMNLP2017,
  author    = {Schulz, Sarah  and  Kuhn, Jonas},
  title     = {Multi-modular domain-tailored {OCR} post-correction},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2716--2726},
  abstract  = {One of the main obstacles for many Digital Humanities projects is the low data
	availability. Texts have to be digitized in an expensive and time consuming
	process whereas Optical Character Recognition (OCR) post-correction is one of
	the time-critical factors. At the example of OCR post-correction, we show the
	adaptation of a generic system to solve a specific problem with little data.
	The system accounts for a diversity of errors encountered in OCRed texts coming
	from different time periods in the domain of literature. We show that the
	combination of different approaches, such as e.g. Statistical Machine
	Translation and spell checking, with the help of a ranking mechanism
	tremendously improves over single-handed approaches. Since we consider the
	accessibility of the resulting tool as
	a crucial part of Digital Humanities collaborations, we describe the workflow
	we suggest for efficient text recognition and subsequent automatic and manual
	post-correction},
  url       = {https://www.aclweb.org/anthology/D17-1288}
}

