% Conference-paper record: Agic and Tadic, "Evaluating Morphosyntactic Tagging
% of Croatian Texts", from the LREC'06 proceedings (2006).
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
% Proper nouns and acronyms in title/booktitle are brace-protected as whole
% words so that sentence-casing styles keep their capitalisation.
% NOTE(review): `address` appears to hold the conference location rather than
% the publisher's city -- confirm this is what the target style expects.
@inproceedings{agic-tadic-2006-evaluating,
    title     = {Evaluating Morphosyntactic Tagging of {Croatian} Texts},
    author    = {Agi{\'c}, {\v{Z}}eljko and
                 Tadi{\'c}, Marko},
    editor    = {Calzolari, Nicoletta and
                 Choukri, Khalid and
                 Gangemi, Aldo and
                 Maegaard, Bente and
                 Mariani, Joseph and
                 Odijk, Jan and
                 Tapias, Daniel},
    booktitle = {Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC}{'}06)},
    month     = may,
    year      = {2006},
    address   = {Genoa, Italy},
    publisher = {European Language Resources Association (ELRA)},
    url       = {http://www.lrec-conf.org/proceedings/lrec2006/pdf/326_pdf.pdf},
    abstract  = {This paper describes results of the first successful effort in applying a stochastic strategy or, namely, a second order Markov model paradigm implemented by the TnT trigram tagger to morphosyntactic tagging of Croatian texts. Beside the tagger, for purposes of both training and testing, we had at our disposal only a 100 Kw Croatia Weekly newspaper subcorpus, manually tagged using approximately 1000 different MULTEXT-East v3 morphosyntactic tags. The test basically consisted of randomly assigning a variable size portion of the corpus for the taggers training procedure and also another fixed-size portion, sized at 10{\%} of the corpus, for the tagging procedure itself; this method allowed us not only to provide preliminary results regarding tagger accuracy on Croatian texts, but also to inspect the behavior of the stochastic tagging paradigm in general. The results were then taken from the test case providing 90{\%} of the corpus for training purposes and varied from around 86{\%} in the worst case scenario up to a peak of around 95{\%} correctly assigned full MSD tags. Results on PoS only expectedly reached the human error level, with TnT correctly tagging above 98{\%} of test sets on average. Most MSD errors occurred on types with the highest number of candidate tags per word form nouns, pronouns and adjectives while errors on PoS, although following the same pattern, were almost insignificant. Detailed insight on tagging, F-measure for all PoS categories is provided in the course of the paper along with other facts of interest.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding one record, ID agic-tadic-2006-evaluating. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="agic-tadic-2006-evaluating">
    <!-- Title of the paper itself -->
    <titleInfo>
      <title>Evaluating Morphosyntactic Tagging of Croatian Texts</title>
    </titleInfo>
    <!-- Authors, in listed order -->
    <name type="personal">
      <namePart type="given">Željko</namePart>
      <namePart type="family">Agić</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marko</namePart>
      <namePart type="family">Tadić</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2006-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Choukri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Aldo</namePart>
        <namePart type="family">Gangemi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bente</namePart>
        <namePart type="family">Maegaard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joseph</namePart>
        <namePart type="family">Mariani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jan</namePart>
        <namePart type="family">Odijk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Tapias</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association (ELRA)</publisher>
        <place>
          <placeTerm type="text">Genoa, Italy</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <!-- Abstract text, kept verbatim -->
    <abstract>This paper describes results of the first successful effort in applying a stochastic strategy or, namely, a second order Markov model paradigm implemented by the TnT trigram tagger to morphosyntactic tagging of Croatian texts. Beside the tagger, for purposes of both training and testing, we had at our disposal only a 100 Kw Croatia Weekly newspaper subcorpus, manually tagged using approximately 1000 different MULTEXT-East v3 morphosyntactic tags. The test basically consisted of randomly assigning a variable size portion of the corpus for the taggers training procedure and also another fixed-size portion, sized at 10% of the corpus, for the tagging procedure itself; this method allowed us not only to provide preliminary results regarding tagger accuracy on Croatian texts, but also to inspect the behavior of the stochastic tagging paradigm in general. The results were then taken from the test case providing 90% of the corpus for training purposes and varied from around 86% in the worst case scenario up to a peak of around 95% correctly assigned full MSD tags. Results on PoS only expectedly reached the human error level, with TnT correctly tagging above 98% of test sets on average. Most MSD errors occurred on types with the highest number of candidate tags per word form nouns, pronouns and adjectives while errors on PoS, although following the same pattern, were almost insignificant. Detailed insight on tagging, F-measure for all PoS categories is provided in the course of the paper along with other facts of interest.</abstract>
    <!-- Citation key and full-text location -->
    <identifier type="citekey">agic-tadic-2006-evaluating</identifier>
    <location>
      <url>http://www.lrec-conf.org/proceedings/lrec2006/pdf/326_pdf.pdf</url>
    </location>
    <part>
      <date>2006-05</date>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Morphosyntactic Tagging of Croatian Texts
%A Agić, Željko
%A Tadić, Marko
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Gangemi, Aldo
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Tapias, Daniel
%S Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)
%D 2006
%8 May
%I European Language Resources Association (ELRA)
%C Genoa, Italy
%F agic-tadic-2006-evaluating
%X This paper describes results of the first successful effort in applying a stochastic strategy or, namely, a second order Markov model paradigm implemented by the TnT trigram tagger to morphosyntactic tagging of Croatian texts. Beside the tagger, for purposes of both training and testing, we had at our disposal only a 100 Kw Croatia Weekly newspaper subcorpus, manually tagged using approximately 1000 different MULTEXT-East v3 morphosyntactic tags. The test basically consisted of randomly assigning a variable size portion of the corpus for the taggers training procedure and also another fixed-size portion, sized at 10% of the corpus, for the tagging procedure itself; this method allowed us not only to provide preliminary results regarding tagger accuracy on Croatian texts, but also to inspect the behavior of the stochastic tagging paradigm in general. The results were then taken from the test case providing 90% of the corpus for training purposes and varied from around 86% in the worst case scenario up to a peak of around 95% correctly assigned full MSD tags. Results on PoS only expectedly reached the human error level, with TnT correctly tagging above 98% of test sets on average. Most MSD errors occurred on types with the highest number of candidate tags per word form nouns, pronouns and adjectives while errors on PoS, although following the same pattern, were almost insignificant. Detailed insight on tagging, F-measure for all PoS categories is provided in the course of the paper along with other facts of interest.
%U http://www.lrec-conf.org/proceedings/lrec2006/pdf/326_pdf.pdf
Markdown (Informal)
[Evaluating Morphosyntactic Tagging of Croatian Texts](http://www.lrec-conf.org/proceedings/lrec2006/pdf/326_pdf.pdf) (Agić & Tadić, LREC 2006)
ACL