% Workshop paper (LT4DH-CEE 2017) on the RoDia diachronic corpus for Romanian.
% Notes for maintainers:
%   - The citation key is kept as originally exported so existing \cite commands
%     keep working, even though it embeds a mangled transliteration.
%   - Romanian a-breve is written {\u{a}} (breve), not \v{a} (caron); the extra
%     braces make each accented letter a single BibTeX "special character".
%   - `doi` holds the bare identifier; `url` is retained for styles without DOI support.
@InProceedings{malahov-mvarvanduc-colesnicov:2017:LT4DH-CEE,
  author    = {Malahov, Ludmila and M{\u{a}}r{\u{a}}nduc, C{\u{a}}t{\u{a}}lina and Colesnicov, Alexandru},
  title     = {A Diachronic Corpus for {Romanian} ({RoDia})},
  booktitle = {Proceedings of the First Workshop on Language technology for Digital Humanities in Central and (South-)Eastern Europe},
  month     = sep,
  year      = {2017},
  address   = {Varna},
  publisher = {INCOMA Inc.},
  pages     = {1--9},
  abstract  = {This paper describes a Romanian Dependency Treebank, built at the Al. I. Cuza
    University (UAIC), and a special OCR techniques used to build it. The corpus
    has rich morphological and syntactic annotation. There are few annotated
    representative corpora in Romanian, and the existent ones are mainly focused on
    the contemporary Romanian standard. The corpus described below is focused on
    the non-standard aspects of the language, the Regional and the Old Romanian.
    Having the intention to participate at the PROIEL project, which aligns oldest
    New Testaments, we annotate the first printed Romanian New Testament (Alba
    Iulia, 1648). We began by applying the UAIC tools for the morphological and
    syntactic processing of Contemporary Romanian over the book's first quarter
    (second edition). By carefully manually correcting the result of the automated
    annotation (having a modest accuracy) we obtained a sub-corpus for the training
    of tools for the Old Romanian processing. But the first edition of the New
    Testament is written in Cyrillic letters. The existence of books printed in the
    Old Cyrillic alphabet is a common problem for Romania and The Republic of
    Moldova, countries where the Romanian is spoken; a problem to solve by the
    joint efforts of the NLP researchers in the two countries.},
  doi       = {10.26615/978-954-452-046-5_001},
  url       = {http://doi.org/10.26615/978-954-452-046-5_001}
}

