% Workshop paper, LT4DH-CEE 2017. Citation key kept unchanged so existing \cite commands still resolve.
% Romanian a-breve is written {\u{a}} (breve), not \v{a} (caron); each accent is its own brace group for correct sorting.
@InProceedings{bobicev-mvarvanduc-perez:2017:LT4DH-CEE,
  author    = {Bobicev, Victoria  and  M{\u{a}}r{\u{a}}nduc, C{\u{a}}t{\u{a}}lina  and  Perez, Cenel Augusto},
  title     = {Tools for Building a Corpus to Study the Historical and Geographical Variation of the {Romanian} Language},
  booktitle = {Proceedings of the First Workshop on Language technology for Digital Humanities in Central and (South-)Eastern Europe},
  month     = sep,
  year      = {2017},
  address   = {Varna},
  publisher = {INCOMA Inc.},
  pages     = {10--19},
  abstract  = {Contemporary standard language corpora are ideal for NLP. There are few
	morphologically and syntactically annotated corpora for Romanian, and those
	existing or in progress only deal with the Contemporary Romanian standard.
	However, the necessity to study the dynamics of natural languages gave rise to
	balanced corpora, containing non-standard texts. In this paper, we describe the
	creation of tools for processing non-standard Romanian to build a big balanced
	corpus. We want to preserve in annotated form as many early stages of language
	as possible. We have already built a corpus in Old Romanian. We also intend to
	include the South-Danube dialects, remote to the standard language, along with
	regional forms closer to the standard. We try to preserve data about endangered
	idioms such as Aromanian, Meglenoromanian and Istroromanian dialects, and
	calculate the distance between different regional variants, including the
	language spoken in the Republic of Moldova. This distance, as well as the
	mutual understanding between the speakers, is the correct criterion for the
	classification of idioms as different languages, or as dialects, or as regional
	variants close to the standard.},
  doi       = {10.26615/978-954-452-046-5_002},
  url       = {https://doi.org/10.26615/978-954-452-046-5_002}
}

