@inproceedings{kumar-2012-challenges,
title = "Challenges in the development of annotated corpora of computer-mediated communication in {I}ndian Languages: A Case of {H}indi",
author = "Kumar, Ritesh",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Do{\u{g}}an, Mehmet U{\u{g}}ur and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/619_Paper.pdf",
pages = "299--302",
abstract = "The present paper describes an ongoing effort to compile and annotate a large corpus of computer-mediated communication (CMC) in Hindi. It describes the process of the compilation of the corpus, the basic structure of the corpus and the annotation of the corpus and the challenges faced in the creation of such a corpus. It also gives a description of the technologies developed for the processing of the data, addition of the metadata and annotation of the corpus. Since it is a corpus of written communication, it provides quite a distinctive challenge for the annotation process. Besides POS annotation, it will also be annotated at higher levels of representation. Once completely developed it will be a very useful resource of Hindi for research in the areas of linguistics, NLP and other social sciences research related to communication, particularly computer-mediated communication..Besides this the challenges discussed here and the way they are tackled could be taken as the model for developing the corpus of computer-mediated communication in other Indian languages. Furthermore the technologies developed for the construction of this corpus will also be made available publicly.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kumar-2012-challenges">
<titleInfo>
<title>Challenges in the development of annotated corpora of computer-mediated communication in Indian Languages: A Case of Hindi</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ritesh</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2012-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehmet</namePart>
<namePart type="given">Uğur</namePart>
<namePart type="family">Doğan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Istanbul, Turkey</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The present paper describes an ongoing effort to compile and annotate a large corpus of computer-mediated communication (CMC) in Hindi. It describes the process of the compilation of the corpus, the basic structure of the corpus and the annotation of the corpus and the challenges faced in the creation of such a corpus. It also gives a description of the technologies developed for the processing of the data, addition of the metadata and annotation of the corpus. Since it is a corpus of written communication, it provides quite a distinctive challenge for the annotation process. Besides POS annotation, it will also be annotated at higher levels of representation. Once completely developed it will be a very useful resource of Hindi for research in the areas of linguistics, NLP and other social sciences research related to communication, particularly computer-mediated communication..Besides this the challenges discussed here and the way they are tackled could be taken as the model for developing the corpus of computer-mediated communication in other Indian languages. Furthermore the technologies developed for the construction of this corpus will also be made available publicly.</abstract>
<identifier type="citekey">kumar-2012-challenges</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/619_Paper.pdf</url>
</location>
<part>
<date>2012-05</date>
<extent unit="page">
<start>299</start>
<end>302</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Challenges in the development of annotated corpora of computer-mediated communication in Indian Languages: A Case of Hindi
%A Kumar, Ritesh
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Doğan, Mehmet Uğur
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 May
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F kumar-2012-challenges
%X The present paper describes an ongoing effort to compile and annotate a large corpus of computer-mediated communication (CMC) in Hindi. It describes the process of the compilation of the corpus, the basic structure of the corpus and the annotation of the corpus and the challenges faced in the creation of such a corpus. It also gives a description of the technologies developed for the processing of the data, addition of the metadata and annotation of the corpus. Since it is a corpus of written communication, it provides quite a distinctive challenge for the annotation process. Besides POS annotation, it will also be annotated at higher levels of representation. Once completely developed it will be a very useful resource of Hindi for research in the areas of linguistics, NLP and other social sciences research related to communication, particularly computer-mediated communication..Besides this the challenges discussed here and the way they are tackled could be taken as the model for developing the corpus of computer-mediated communication in other Indian languages. Furthermore the technologies developed for the construction of this corpus will also be made available publicly.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/619_Paper.pdf
%P 299-302
Markdown (Informal)
[Challenges in the development of annotated corpora of computer-mediated communication in Indian Languages: A Case of Hindi](http://www.lrec-conf.org/proceedings/lrec2012/pdf/619_Paper.pdf) (Kumar, LREC 2012)
ACL