@inproceedings{baxi-bhatt-2022-gujmorph,
title = "{G}uj{MORPH} - A Dataset for Creating {G}ujarati Morphological Analyzer",
author = "Baxi, Jatayu and
Bhatt, Brijesh",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.767",
pages = "7088--7095",
abstract = "Computational morphology deals with the processing of a language at the word level. A morphological analyzer is a key linguistic word-level tool that returns all the constituent morphemes and their grammatical categories associated with a particular word form. For the highly inflectional and low resource languages, the creation of computational morphology-related tools is a challenging task due to the unavailability of underlying key resources. In this paper, we discuss the creation of an annotated morphological dataset- GujMORPH for the Gujarati - an indo-aryan language. For the creation of this dataset, we studied language grammar, word formation rules, and suffix attachments in depth. This dataset contains 16,527 unique inflected words along with their morphological segmentation and grammatical feature tagging information. It is a first of its kind dataset for the Gujarati language and can be used to develop morphological analyzer and generator models. The dataset is annotated in the standard Unimorph schema and evaluated on the baseline system. We also describe the tool used to annotate the data in the standard format. The dataset is released publicly along with the library. Using this library, the data can be obtained in a format that can be directly used to train any machine learning model.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="baxi-bhatt-2022-gujmorph">
<titleInfo>
<title>GujMORPH - A Dataset for Creating Gujarati Morphological Analyzer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jatayu</namePart>
<namePart type="family">Baxi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brijesh</namePart>
<namePart type="family">Bhatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Computational morphology deals with the processing of a language at the word level. A morphological analyzer is a key linguistic word-level tool that returns all the constituent morphemes and their grammatical categories associated with a particular word form. For the highly inflectional and low resource languages, the creation of computational morphology-related tools is a challenging task due to the unavailability of underlying key resources. In this paper, we discuss the creation of an annotated morphological dataset- GujMORPH for the Gujarati - an indo-aryan language. For the creation of this dataset, we studied language grammar, word formation rules, and suffix attachments in depth. This dataset contains 16,527 unique inflected words along with their morphological segmentation and grammatical feature tagging information. It is a first of its kind dataset for the Gujarati language and can be used to develop morphological analyzer and generator models. The dataset is annotated in the standard Unimorph schema and evaluated on the baseline system. We also describe the tool used to annotate the data in the standard format. The dataset is released publicly along with the library. Using this library, the data can be obtained in a format that can be directly used to train any machine learning model.</abstract>
<identifier type="citekey">baxi-bhatt-2022-gujmorph</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.767</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>7088</start>
<end>7095</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GujMORPH - A Dataset for Creating Gujarati Morphological Analyzer
%A Baxi, Jatayu
%A Bhatt, Brijesh
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F baxi-bhatt-2022-gujmorph
%X Computational morphology deals with the processing of a language at the word level. A morphological analyzer is a key linguistic word-level tool that returns all the constituent morphemes and their grammatical categories associated with a particular word form. For the highly inflectional and low resource languages, the creation of computational morphology-related tools is a challenging task due to the unavailability of underlying key resources. In this paper, we discuss the creation of an annotated morphological dataset- GujMORPH for the Gujarati - an indo-aryan language. For the creation of this dataset, we studied language grammar, word formation rules, and suffix attachments in depth. This dataset contains 16,527 unique inflected words along with their morphological segmentation and grammatical feature tagging information. It is a first of its kind dataset for the Gujarati language and can be used to develop morphological analyzer and generator models. The dataset is annotated in the standard Unimorph schema and evaluated on the baseline system. We also describe the tool used to annotate the data in the standard format. The dataset is released publicly along with the library. Using this library, the data can be obtained in a format that can be directly used to train any machine learning model.
%U https://aclanthology.org/2022.lrec-1.767
%P 7088-7095
Markdown (Informal)
[GujMORPH - A Dataset for Creating Gujarati Morphological Analyzer](https://aclanthology.org/2022.lrec-1.767) (Baxi & Bhatt, LREC 2022)
ACL