@inproceedings{huang-etal-2010-predicting,
title = "Predicting Morphological Types of {C}hinese Bi-Character Words by Machine Learning Approaches",
author = "Huang, Ting-Hao and
Ku, Lun-Wei and
Chen, Hsin-Hsi",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/397_Paper.pdf",
abstract = {This paper presented an overview of Chinese bi-character words morphological types, and proposed a set of features for machine learning approaches to predict these types based on composite characters information. First, eight morphological types were defined, and 6,500 Chinese bi-character words were annotated with these types. After pre-processing, 6,178 words were selected to construct a corpus named Reduced Set. We analyzed Reduced Set and conducted the inter-annotator agreement test. The average kappa value of 0.67 indicates a substantial agreement. Second, Bi-character words morphological types are considered strongly related with the composite characters parts of speech in this paper, so we proposed a set of features which can simply be extracted from dictionaries to indicate the characters tendency of parts of speech. Finally, we used these features and adopted three machine learning algorithms, SVM, CRF, and Na{\"\i}ve Bayes, to predict the morphological types. On the average, the best algorithm CRF achieved 75{\%} of the annotators performance.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2010-predicting">
<titleInfo>
<title>Predicting Morphological Types of Chinese Bi-Character Words by Machine Learning Approaches</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ting-Hao</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presented an overview of Chinese bi-character words morphological types, and proposed a set of features for machine learning approaches to predict these types based on composite characters information. First, eight morphological types were defined, and 6,500 Chinese bi-character words were annotated with these types. After pre-processing, 6,178 words were selected to construct a corpus named Reduced Set. We analyzed Reduced Set and conducted the inter-annotator agreement test. The average kappa value of 0.67 indicates a substantial agreement. Second, Bi-character words morphological types are considered strongly related with the composite characters parts of speech in this paper, so we proposed a set of features which can simply be extracted from dictionaries to indicate the characters tendency of parts of speech. Finally, we used these features and adopted three machine learning algorithms, SVM, CRF, and Naïve Bayes, to predict the morphological types. On the average, the best algorithm CRF achieved 75% of the annotators performance.</abstract>
<identifier type="citekey">huang-etal-2010-predicting</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/397_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Predicting Morphological Types of Chinese Bi-Character Words by Machine Learning Approaches
%A Huang, Ting-Hao
%A Ku, Lun-Wei
%A Chen, Hsin-Hsi
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F huang-etal-2010-predicting
%X This paper presented an overview of Chinese bi-character words morphological types, and proposed a set of features for machine learning approaches to predict these types based on composite characters information. First, eight morphological types were defined, and 6,500 Chinese bi-character words were annotated with these types. After pre-processing, 6,178 words were selected to construct a corpus named Reduced Set. We analyzed Reduced Set and conducted the inter-annotator agreement test. The average kappa value of 0.67 indicates a substantial agreement. Second, Bi-character words morphological types are considered strongly related with the composite characters parts of speech in this paper, so we proposed a set of features which can simply be extracted from dictionaries to indicate the characters tendency of parts of speech. Finally, we used these features and adopted three machine learning algorithms, SVM, CRF, and Naïve Bayes, to predict the morphological types. On the average, the best algorithm CRF achieved 75% of the annotators performance.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/397_Paper.pdf
Markdown (Informal)
[Predicting Morphological Types of Chinese Bi-Character Words by Machine Learning Approaches](http://www.lrec-conf.org/proceedings/lrec2010/pdf/397_Paper.pdf) (Huang et al., LREC 2010)
ACL