% Conference paper in the LREC 2012 proceedings (pages 657--662).
% Proper nouns and acronyms in the title fields are brace-protected as whole
% words so that sentence-casing styles keep their capitalisation.
@inproceedings{karan-etal-2012-evaluation,
  title     = {Evaluation of Classification Algorithms and Features for Collocation Extraction in {Croatian}},
  author    = {Karan, Vanja Mladen and
               {\v{S}}najder, Jan and
               Ba{\v{s}}i{\'c}, Bojana Dalbelo},
  editor    = {Calzolari, Nicoletta and
               Choukri, Khalid and
               Declerck, Thierry and
               Do{\u{g}}an, Mehmet U{\u{g}}ur and
               Maegaard, Bente and
               Mariani, Joseph and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)},
  month     = may,
  year      = {2012},
  address   = {Istanbul, Turkey},
  publisher = {European Language Resources Association (ELRA)},
  url       = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/796_Paper.pdf},
  pages     = {657--662},
  abstract  = {Collocations can be defined as words that occur together significantly more often than it would be expected by chance. Many natural language processing applications such as natural language generation, word sense disambiguation and machine translation can benefit from having access to information about collocated words. We approach collocation extraction as a classification problem where the task is to classify a given n-gram as either a collocation (positive) or a non-collocation (negative). Among the features used are word frequencies, classical association measures (Dice, PMI, chi2), and POS tags. In addition, semantic word relatedness modeled by latent semantic analysis is also included. We apply wrapper feature subset selection to determine the best set of features. Performance of various classification algorithms is tested. Experiments are conducted on a manually annotated set of bigrams and trigrams sampled from a Croatian newspaper corpus. Best results obtained are 79.8 F1 measure for bigrams and 67.5 F1 measure for trigrams. The best classifier for bigrams was SVM, while for trigrams the decision tree gave the best performance. Features which contributed the most to overall performance were PMI, semantic relatedness, and POS information.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, citekey karan-etal-2012-evaluation. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="karan-etal-2012-evaluation">
    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>Evaluation of Classification Algorithms and Features for Collocation Extraction in Croatian</title>
    </titleInfo>

    <!-- Authors, in the order listed. -->
    <name type="personal">
      <namePart type="given">Vanja</namePart>
      <namePart type="given">Mladen</namePart>
      <namePart type="family">Karan</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Jan</namePart>
      <namePart type="family">Šnajder</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Bojana</namePart>
      <namePart type="given">Dalbelo</namePart>
      <namePart type="family">Bašić</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>

    <!-- Issue date of the paper (year and month). -->
    <originInfo>
      <dateIssued>2012-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Choukri</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Thierry</namePart>
        <namePart type="family">Declerck</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Mehmet</namePart>
        <namePart type="given">Uğur</namePart>
        <namePart type="family">Doğan</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Bente</namePart>
        <namePart type="family">Maegaard</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Joseph</namePart>
        <namePart type="family">Mariani</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Asuncion</namePart>
        <namePart type="family">Moreno</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Jan</namePart>
        <namePart type="family">Odijk</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Stelios</namePart>
        <namePart type="family">Piperidis</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association (ELRA)</publisher>
        <place>
          <placeTerm type="text">Istanbul, Turkey</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>Collocations can be defined as words that occur together significantly more often than it would be expected by chance. Many natural language processing applications such as natural language generation, word sense disambiguation and machine translation can benefit from having access to information about collocated words. We approach collocation extraction as a classification problem where the task is to classify a given n-gram as either a collocation (positive) or a non-collocation (negative). Among the features used are word frequencies, classical association measures (Dice, PMI, chi2), and POS tags. In addition, semantic word relatedness modeled by latent semantic analysis is also included. We apply wrapper feature subset selection to determine the best set of features. Performance of various classification algorithms is tested. Experiments are conducted on a manually annotated set of bigrams and trigrams sampled from a Croatian newspaper corpus. Best results obtained are 79.8 F1 measure for bigrams and 67.5 F1 measure for trigrams. The best classifier for bigrams was SVM, while for trigrams the decision tree gave the best performance. Features which contributed the most to overall performance were PMI, semantic relatedness, and POS information.</abstract>

    <!-- Identifier and location of the full text. -->
    <identifier type="citekey">karan-etal-2012-evaluation</identifier>
    <location>
      <url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/796_Paper.pdf</url>
    </location>

    <!-- Position of the paper within the host item: date and page extent. -->
    <part>
      <date>2012-05</date>
      <extent unit="page">
        <start>657</start>
        <end>662</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluation of Classification Algorithms and Features for Collocation Extraction in Croatian
%A Karan, Vanja Mladen
%A Šnajder, Jan
%A Bašić, Bojana Dalbelo
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Doğan, Mehmet Uğur
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 May
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F karan-etal-2012-evaluation
%X Collocations can be defined as words that occur together significantly more often than it would be expected by chance. Many natural language processing applications such as natural language generation, word sense disambiguation and machine translation can benefit from having access to information about collocated words. We approach collocation extraction as a classification problem where the task is to classify a given n-gram as either a collocation (positive) or a non-collocation (negative). Among the features used are word frequencies, classical association measures (Dice, PMI, chi2), and POS tags. In addition, semantic word relatedness modeled by latent semantic analysis is also included. We apply wrapper feature subset selection to determine the best set of features. Performance of various classification algorithms is tested. Experiments are conducted on a manually annotated set of bigrams and trigrams sampled from a Croatian newspaper corpus. Best results obtained are 79.8 F1 measure for bigrams and 67.5 F1 measure for trigrams. The best classifier for bigrams was SVM, while for trigrams the decision tree gave the best performance. Features which contributed the most to overall performance were PMI, semantic relatedness, and POS information.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/796_Paper.pdf
%P 657-662
Markdown (Informal)
[Evaluation of Classification Algorithms and Features for Collocation Extraction in Croatian](http://www.lrec-conf.org/proceedings/lrec2012/pdf/796_Paper.pdf) (Karan et al., LREC 2012)
ACL