@inproceedings{bobicev-etal-2017-tools,
    title = "Tools for Building a Corpus to Study the Historical and Geographical Variation of the {R}omanian Language",
    author = "Bobicev, Victoria and
      M{\u{a}}r{\u{a}}nduc, C{\u{a}}t{\u{a}}lina and
      Perez, Cenel Augusto",
    editor = "Dinu, Anca and
      Osenova, Petya and
      Vertan, Cristina",
    booktitle = "Proceedings of the First Workshop on Language technology for Digital Humanities in Central and (South-)Eastern {E}urope",
    month = sep,
    year = "2017",
    address = "Varna",
    publisher = "INCOMA Inc.",
    url = "http://doi.org/10.26615/978-954-452-046-5_002",
    doi = "10.26615/978-954-452-046-5_002",
    pages = "10--19",
    abstract = "Contemporary standard language corpora are ideal for NLP. There are few morphologically and syntactically annotated corpora for Romanian, and those existing or in progress only deal with the Contemporary Romanian standard. However, the necessity to study the dynamics of natural languages gave rise to balanced corpora, containing non-standard texts. In this paper, we describe the creation of tools for processing non-standard Romanian to build a big balanced corpus. We want to preserve in annotated form as many early stages of language as possible. We have already built a corpus in Old Romanian. We also intend to include the South-Danube dialects, remote to the standard language, along with regional forms closer to the standard. We try to preserve data about endangered idioms such as Aromanian, Meglenoromanian and Istroromanian dialects, and calculate the distance between different regional variants, including the language spoken in the Republic of Moldova. This distance, as well as the mutual understanding between the speakers, is the correct criterion for the classification of idioms as different languages, or as dialects, or as regional variants close to the standard.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="bobicev-etal-2017-tools">
    <titleInfo>
      <title>Tools for Building a Corpus to Study the Historical and Geographical Variation of the Romanian Language</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Victoria</namePart>
      <namePart type="family">Bobicev</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Cătălina</namePart>
      <namePart type="family">Mărănduc</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Cenel</namePart>
      <namePart type="given">Augusto</namePart>
      <namePart type="family">Perez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2017-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop on Language technology for Digital Humanities in Central and (South-)Eastern Europe</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anca</namePart>
        <namePart type="family">Dinu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Petya</namePart>
        <namePart type="family">Osenova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Cristina</namePart>
        <namePart type="family">Vertan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>INCOMA Inc.</publisher>
        <place>
          <placeTerm type="text">Varna</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Contemporary standard language corpora are ideal for NLP. There are few morphologically and syntactically annotated corpora for Romanian, and those existing or in progress only deal with the Contemporary Romanian standard. However, the necessity to study the dynamics of natural languages gave rise to balanced corpora, containing non-standard texts. In this paper, we describe the creation of tools for processing non-standard Romanian to build a big balanced corpus. We want to preserve in annotated form as many early stages of language as possible. We have already built a corpus in Old Romanian. We also intend to include the South-Danube dialects, remote to the standard language, along with regional forms closer to the standard. We try to preserve data about endangered idioms such as Aromanian, Meglenoromanian and Istroromanian dialects, and calculate the distance between different regional variants, including the language spoken in the Republic of Moldova. This distance, as well as the mutual understanding between the speakers, is the correct criterion for the classification of idioms as different languages, or as dialects, or as regional variants close to the standard.</abstract>
    <identifier type="citekey">bobicev-etal-2017-tools</identifier>
    <!-- Bare DOI (no resolver prefix); same DOI as the one embedded in the location URL below. -->
    <identifier type="doi">10.26615/978-954-452-046-5_002</identifier>
    <location>
      <url>http://doi.org/10.26615/978-954-452-046-5_002</url>
    </location>
    <part>
      <date>2017-09</date>
      <extent unit="page">
        <start>10</start>
        <end>19</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Tools for Building a Corpus to Study the Historical and Geographical Variation of the Romanian Language
%A Bobicev, Victoria
%A Mărănduc, Cătălina
%A Perez, Cenel Augusto
%Y Dinu, Anca
%Y Osenova, Petya
%Y Vertan, Cristina
%S Proceedings of the First Workshop on Language technology for Digital Humanities in Central and (South-)Eastern Europe
%D 2017
%8 September
%I INCOMA Inc.
%C Varna
%F bobicev-etal-2017-tools
%X Contemporary standard language corpora are ideal for NLP. There are few morphologically and syntactically annotated corpora for Romanian, and those existing or in progress only deal with the Contemporary Romanian standard. However, the necessity to study the dynamics of natural languages gave rise to balanced corpora, containing non-standard texts. In this paper, we describe the creation of tools for processing non-standard Romanian to build a big balanced corpus. We want to preserve in annotated form as many early stages of language as possible. We have already built a corpus in Old Romanian. We also intend to include the South-Danube dialects, remote to the standard language, along with regional forms closer to the standard. We try to preserve data about endangered idioms such as Aromanian, Meglenoromanian and Istroromanian dialects, and calculate the distance between different regional variants, including the language spoken in the Republic of Moldova. This distance, as well as the mutual understanding between the speakers, is the correct criterion for the classification of idioms as different languages, or as dialects, or as regional variants close to the standard.
%R 10.26615/978-954-452-046-5_002
%U http://doi.org/10.26615/978-954-452-046-5_002
%U https://doi.org/10.26615/978-954-452-046-5_002
%P 10-19
Markdown (Informal)
[Tools for Building a Corpus to Study the Historical and Geographical Variation of the Romanian Language](http://doi.org/10.26615/978-954-452-046-5_002) (Bobicev et al., RANLP 2017)
ACL