% Conference paper record. The doi field holds the bare DOI (starting with the
% "10." directory indicator) and matches the identifier used in the url field.
@inproceedings{malahov-etal-2017-diachronic,
    title = "A Diachronic Corpus for {R}omanian ({R}o{D}ia)",
    author = "Malahov, Ludmila and
      M{\u{a}}r{\u{a}}nduc, C{\u{a}}t{\u{a}}lina and
      Colesnicov, Alexandru",
    editor = "Dinu, Anca and
      Osenova, Petya and
      Vertan, Cristina",
    booktitle = "Proceedings of the First Workshop on Language technology for Digital Humanities in Central and (South-)Eastern {E}urope",
    month = sep,
    year = "2017",
    address = "Varna",
    publisher = "INCOMA Inc.",
    url = "http://doi.org/10.26615/978-954-452-046-5_001",
    doi = "10.26615/978-954-452-046-5_001",
    pages = "1--9",
    abstract = "This paper describes a Romanian Dependency Treebank, built at the Al. I. Cuza University (UAIC), and a special OCR techniques used to build it. The corpus has rich morphological and syntactic annotation. There are few annotated representative corpora in Romanian, and the existent ones are mainly focused on the contemporary Romanian standard. The corpus described below is focused on the non-standard aspects of the language, the Regional and the Old Romanian. Having the intention to participate at the PROIEL project, which aligns oldest New Testaments, we annotate the first printed Romanian New Testament (Alba Iulia, 1648). We began by applying the UAIC tools for the morphological and syntactic processing of Contemporary Romanian over the book{'}s first quarter (second edition). By carefully manually correcting the result of the automated annotation (having a modest accuracy) we obtained a sub-corpus for the training of tools for the Old Romanian processing. But the first edition of the New Testament is written in Cyrillic letters. The existence of books printed in the Old Cyrillic alphabet is a common problem for Romania and The Republic of Moldova, countries where the Romanian is spoken; a problem to solve by the joint efforts of the NLP researchers in the two countries.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="malahov-etal-2017-diachronic">
<titleInfo>
<title>A Diachronic Corpus for Romanian (RoDia)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ludmila</namePart>
<namePart type="family">Malahov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cătălina</namePart>
<namePart type="family">Mărănduc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandru</namePart>
<namePart type="family">Colesnicov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Language technology for Digital Humanities in Central and (South-)Eastern Europe</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anca</namePart>
<namePart type="family">Dinu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petya</namePart>
<namePart type="family">Osenova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cristina</namePart>
<namePart type="family">Vertan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Inc.</publisher>
<place>
<placeTerm type="text">Varna</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes a Romanian Dependency Treebank, built at the Al. I. Cuza University (UAIC), and a special OCR techniques used to build it. The corpus has rich morphological and syntactic annotation. There are few annotated representative corpora in Romanian, and the existent ones are mainly focused on the contemporary Romanian standard. The corpus described below is focused on the non-standard aspects of the language, the Regional and the Old Romanian. Having the intention to participate at the PROIEL project, which aligns oldest New Testaments, we annotate the first printed Romanian New Testament (Alba Iulia, 1648). We began by applying the UAIC tools for the morphological and syntactic processing of Contemporary Romanian over the book’s first quarter (second edition). By carefully manually correcting the result of the automated annotation (having a modest accuracy) we obtained a sub-corpus for the training of tools for the Old Romanian processing. But the first edition of the New Testament is written in Cyrillic letters. The existence of books printed in the Old Cyrillic alphabet is a common problem for Romania and The Republic of Moldova, countries where the Romanian is spoken; a problem to solve by the joint efforts of the NLP researchers in the two countries.</abstract>
<identifier type="citekey">malahov-etal-2017-diachronic</identifier>
<identifier type="doi">10.26615/978-954-452-046-5_001</identifier>
<location>
<url>http://doi.org/10.26615/978-954-452-046-5_001</url>
</location>
<part>
<date>2017-09</date>
<extent unit="page">
<start>1</start>
<end>9</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Diachronic Corpus for Romanian (RoDia)
%A Malahov, Ludmila
%A Mărănduc, Cătălina
%A Colesnicov, Alexandru
%Y Dinu, Anca
%Y Osenova, Petya
%Y Vertan, Cristina
%S Proceedings of the First Workshop on Language technology for Digital Humanities in Central and (South-)Eastern Europe
%D 2017
%8 September
%I INCOMA Inc.
%C Varna
%F malahov-etal-2017-diachronic
%X This paper describes a Romanian Dependency Treebank, built at the Al. I. Cuza University (UAIC), and a special OCR techniques used to build it. The corpus has rich morphological and syntactic annotation. There are few annotated representative corpora in Romanian, and the existent ones are mainly focused on the contemporary Romanian standard. The corpus described below is focused on the non-standard aspects of the language, the Regional and the Old Romanian. Having the intention to participate at the PROIEL project, which aligns oldest New Testaments, we annotate the first printed Romanian New Testament (Alba Iulia, 1648). We began by applying the UAIC tools for the morphological and syntactic processing of Contemporary Romanian over the book’s first quarter (second edition). By carefully manually correcting the result of the automated annotation (having a modest accuracy) we obtained a sub-corpus for the training of tools for the Old Romanian processing. But the first edition of the New Testament is written in Cyrillic letters. The existence of books printed in the Old Cyrillic alphabet is a common problem for Romania and The Republic of Moldova, countries where the Romanian is spoken; a problem to solve by the joint efforts of the NLP researchers in the two countries.
%R 10.26615/978-954-452-046-5_001
%U http://doi.org/10.26615/978-954-452-046-5_001
%U https://doi.org/10.26615/978-954-452-046-5_001
%P 1-9
Markdown (Informal)
[A Diachronic Corpus for Romanian (RoDia)](http://doi.org/10.26615/978-954-452-046-5_001) (Malahov et al., RANLP 2017)
ACL
- Ludmila Malahov, Cătălina Mărănduc, and Alexandru Colesnicov. 2017. A Diachronic Corpus for Romanian (RoDia). In Proceedings of the First Workshop on Language technology for Digital Humanities in Central and (South-)Eastern Europe, pages 1–9, Varna. INCOMA Inc.