@inproceedings{maragoudakis-etal-2006-dealing,
title = "Dealing with Imbalanced Data using {B}ayesian Techniques",
author = "Maragoudakis, Manolis and
Kermanidis, Katia and
Garbis, Aristogiannis and
Fakotakis, Nikos",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Gangemi, Aldo and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Tapias, Daniel",
booktitle = "Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC}{'}06)",
month = may,
year = "2006",
address = "Genoa, Italy",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2006/pdf/503_pdf.pdf",
abstract = "For the present work, we deal with the significant problem of high imbalance in data in binary or multi-class classification problems. We study two different linguistic applications. The former determines whether a syntactic construction (environment) co-occurs with a verb in a natural text corpus consists a subcategorization frame of the verb or not. The latter is called Name Entity Recognition (NER) and it concerns determining whether a noun belongs to a specific Name Entity class. Regarding the subcategorization domain, each environment is encoded as a vector of heterogeneous attributes, where a very high imbalance between positive and negative examples is observed (an imbalance ratio of approximately 1:80). In the NER application, the imbalance between a name entity class and the negative class is even greater (1:120). In order to confront the plethora of negative instances, we suggest a search tactic during training phase that employs Tomek links for reducing unnecessary negative examples from the training set. Regarding the classification mechanism, we argue that Bayesian networks are well suited and we propose a novel network structure which efficiently handles heterogeneous attributes without discretization and is more classification-oriented. Comparing the experimental results with those of other known machine learning algorithms, our methodology performs significantly better in detecting examples of the rare class.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="maragoudakis-etal-2006-dealing">
<titleInfo>
<title>Dealing with Imbalanced Data using Bayesian Techniques</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manolis</namePart>
<namePart type="family">Maragoudakis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katia</namePart>
<namePart type="family">Kermanidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aristogiannis</namePart>
<namePart type="family">Garbis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikos</namePart>
<namePart type="family">Fakotakis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2006-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aldo</namePart>
<namePart type="family">Gangemi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Genoa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>For the present work, we deal with the significant problem of high imbalance in data in binary or multi-class classification problems. We study two different linguistic applications. The former determines whether a syntactic construction (environment) co-occurs with a verb in a natural text corpus consists a subcategorization frame of the verb or not. The latter is called Name Entity Recognition (NER) and it concerns determining whether a noun belongs to a specific Name Entity class. Regarding the subcategorization domain, each environment is encoded as a vector of heterogeneous attributes, where a very high imbalance between positive and negative examples is observed (an imbalance ratio of approximately 1:80). In the NER application, the imbalance between a name entity class and the negative class is even greater (1:120). In order to confront the plethora of negative instances, we suggest a search tactic during training phase that employs Tomek links for reducing unnecessary negative examples from the training set. Regarding the classification mechanism, we argue that Bayesian networks are well suited and we propose a novel network structure which efficiently handles heterogeneous attributes without discretization and is more classification-oriented. Comparing the experimental results with those of other known machine learning algorithms, our methodology performs significantly better in detecting examples of the rare class.</abstract>
<identifier type="citekey">maragoudakis-etal-2006-dealing</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2006/pdf/503_pdf.pdf</url>
</location>
<part>
<date>2006-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dealing with Imbalanced Data using Bayesian Techniques
%A Maragoudakis, Manolis
%A Kermanidis, Katia
%A Garbis, Aristogiannis
%A Fakotakis, Nikos
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Gangemi, Aldo
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Tapias, Daniel
%S Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)
%D 2006
%8 May
%I European Language Resources Association (ELRA)
%C Genoa, Italy
%F maragoudakis-etal-2006-dealing
%X For the present work, we deal with the significant problem of high imbalance in data in binary or multi-class classification problems. We study two different linguistic applications. The former determines whether a syntactic construction (environment) co-occurs with a verb in a natural text corpus consists a subcategorization frame of the verb or not. The latter is called Name Entity Recognition (NER) and it concerns determining whether a noun belongs to a specific Name Entity class. Regarding the subcategorization domain, each environment is encoded as a vector of heterogeneous attributes, where a very high imbalance between positive and negative examples is observed (an imbalance ratio of approximately 1:80). In the NER application, the imbalance between a name entity class and the negative class is even greater (1:120). In order to confront the plethora of negative instances, we suggest a search tactic during training phase that employs Tomek links for reducing unnecessary negative examples from the training set. Regarding the classification mechanism, we argue that Bayesian networks are well suited and we propose a novel network structure which efficiently handles heterogeneous attributes without discretization and is more classification-oriented. Comparing the experimental results with those of other known machine learning algorithms, our methodology performs significantly better in detecting examples of the rare class.
%U http://www.lrec-conf.org/proceedings/lrec2006/pdf/503_pdf.pdf
Markdown (Informal)
[Dealing with Imbalanced Data using Bayesian Techniques](http://www.lrec-conf.org/proceedings/lrec2006/pdf/503_pdf.pdf) (Maragoudakis et al., LREC 2006)
ACL
- Manolis Maragoudakis, Katia Kermanidis, Aristogiannis Garbis, and Nikos Fakotakis. 2006. Dealing with Imbalanced Data using Bayesian Techniques. In Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06), Genoa, Italy. European Language Resources Association (ELRA).