<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.008517">
<title confidence="0.94866">
IITP: Multiobjective Differential Evolution based Twitter Named Entity
Recognition
</title>
<author confidence="0.993138">
Md Shad Akhtar, Utpal Kumar Sikdar and Asif Ekbal
</author>
<affiliation confidence="0.814653666666667">
Dept of Computer Science and Engineering
IIT Patna
Patna, India
</affiliation>
<email confidence="0.995189">
(shad.pcs15,utpal.sikdar,asif)@iitp.ac.in
</email>
<sectionHeader confidence="0.993848" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999413909090909">
In this paper we propose a differential evo-
lution (DE) based named entity recogni-
tion (NER) system in twitter data. In
the first step, we develop various NER
systems using different combinations of
the features. We implemented these fea-
tures without using any domain-specific
features and/or resources. As a base clas-
sifier we use Conditional Random Field
(CRF). In the second step, we propose
a DE based feature selection approach to
determine the most relevant set of fea-
tures and its context information. The op-
timized feature set applied to the train-
ing set yields the precision, recall and F-
measure values of 60.68%, 29.65% and
39.84%, respectively for the fine-grained
named entity (NE) types. When we con-
sider only the coarse-grained NE types, it
shows the precision, recall and F-measure
values of 63.43%, 51.44% and 56.81%, re-
spectively.
</bodyText>
<sectionHeader confidence="0.998993" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999964">
During the last few years there has been a phe-
nomenal growth in the number of users that make
use of different social networking platforms to
share their opinions and views. Twitter now has
over 500 million users with approx 302 mil-
lion active users 1. One can easily imagine that
amount of tweets generated per day would be
enormous i.e. almost 500 million tweets per day
2. These information are usually unstructured and
noisy in nature. The reason behind its unstruc-
tured nature is that tweets are rather short mes-
sages (constitute up to 140 characters only), con-
tain several grammatical &amp; spelling mistakes etc.
</bodyText>
<footnote confidence="0.827418666666667">
1http://en.wikipedia.org/wiki/Twitter
2http://www.cnet.com/news/report-twitter-hits-half-a-
billion-tweets-a-day/
</footnote>
<bodyText confidence="0.999445658536585">
The size limitation bounds a user to invent sev-
eral short forms (e.g. 2mrw, tmrw for tomorrow)
of a valid word which a human mind can interpret
easily but, on the other hand, becomes very diffi-
cult to come up with an accurate system for solv-
ing any problem related to natural language pro-
cessing (NLP). Also in order to show their emo-
tions, users sometime put extra emphasis by elon-
gating a valid word (e.g. yeeesssss!! for yes).
Named entity recognition (NER) can be seen
as one of the important and foremost tasks for
many natural language processing (NLP) tasks
such as machine translation, information extrac-
tion, question-answering etc. The task of NER
can be thought of as a two-step process that in-
volves identifying proper names from the text and
classifying them into some predefined categories
such as person, organization, location etc. Al-
though the techniques (Bikel et al., 1999; Ekbal
and Bandyopadhyay, 2008a; Ekbal and Bandy-
opadhyay, 2008b; Sikdar et al., 2012) for recog-
nizing named entities (NEs) in newswire and other
well-formatted traditional corpus has already ma-
tured but it is still a challenging task to perform in
unstructured and noisy twitter data.
The concept of NER in twitter has recently
drawn the attention of researchers worldwide.
Very few authors have reported their works (Liu
et al., 2011; Ramage et al., 2009; Li et al., 2012)
for NER in twitter. A semi-supervised model for
NER has been reported in (Liu et al., 2011) where
K-nearest neighbour classifier is combined with
CRF. Application of LabeledLDA (Ramage et al.,
2009) in supervised environment can be found in
(Ritter et al., 2011). Their method classifies NEs
into fine-grained types of 10 classes (as in our
case). In another work (Li et al., 2012), authors
have used random walk model to build an unsu-
pervised approach to NER. They modelled their
system on local(tweets) and global (www) context
without employing any of the linguistic features.
</bodyText>
<page confidence="0.99261">
61
</page>
<note confidence="0.791954">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 61–67,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.9983463">
Few more related works can be found in (Derczyn-
ski et al., 2015) and (Locke and Martin, 2009).
Due to several challenges it poses, recently there
has been a huge interest to identify NE in twit-
ter data. In compliance with this a shared task
“ACL2015 W-NUT: Named Entity Recognition
in Twitter”3 was organized. The work that we
report here is a part of this shared task. The
main objective of the shared task was to effi-
ciently identify various coarse-grained and fine-
grained named entities. Fine-grained NE types
include 10 different categories namely, person,
product, company, geo-loc, movie, musicartist,
tvshow, facility, sportsteam and other. We have
used a rich feature set based on lexical and syntac-
tic properties of a tweet as discussed in Section
3.9. Our proposed work uses Conditional Ran-
dom Field (CRF) (Lafferty et al., 2001) as learning
algorithm, which is very efficient as a sequence
learner. Subsequently we have applied Differen-
tial Evolution (DE), a stochastic, population based
optimization algorithm, introduced by Storn and
Price in 1996 (Storn and Price, 1997), to obtain
the optimal feature set for NER in twitter data.
The organization of the paper is as follows. Sec-
tion 2 provides a very brief theoretical discussion
of DE. Feature set and methodology used in the
proposed work are discussed in Section 3. Experi-
mental result and analysis can be found in Section
4. We conclude the paper in Section 5.
</bodyText>
<sectionHeader confidence="0.988467" genericHeader="method">
2 MultiObjective Differential Evolution
(DE)
</sectionHeader>
<bodyText confidence="0.999560235294118">
Differential Evolution (DE) (Storn and Price,
1997) is a heuristic search optimization technique
and it provides near optimal solution for an opti-
mization problem. Within a search space the pa-
rameters are encoded in the form of string, which
is called chromosome/vector. A chromosome is,
therefore, nothing but a sequence of D real val-
ues. A collection of such types of chromosomes
is called population. A fitness value is associ-
ated with each chromosome. For single objective
optimization the fitness value depends upon the
these D number of real parameters. For multiob-
jective optimization, more than one fitness value
is associated with each chromosome. The fitness
value denotes the goodness of the chromosome.
DE generates new vector by adding the weighted
difference between two vectors to the third vec-
</bodyText>
<footnote confidence="0.681933">
3http://noisy-text.github.io/
</footnote>
<bodyText confidence="0.999687416666667">
tor. This operation is called the mutation. In the
next step, the mutant vector parameters are mixed
with the parameters of the predefined vector. The
new vector is termed as the trial vector, and the
parameter mixing process is called crossover. The
best vectors are selected from the trial vectors. The
process of selecting new vectors from the current
population is known as selection. The algorithm
that we follow for this is known as the crowding
distance sorting algorithm. The processes of mu-
tation, crossover and selection continue for a fixed
number of generation.
</bodyText>
<sectionHeader confidence="0.997485" genericHeader="method">
3 Methods
</sectionHeader>
<bodyText confidence="0.999964090909091">
The proposed system is consisting of two steps.
In the very first step we generate many models
based on the best fitting feature sets. Following
this heuristic based approach we select the best
model by fine-tuning on the development data. In
the second step we develop a multiobjective DE
based feature selection approach to find out the
best feature combinations and its contextual infor-
mation from the selected feature set. Schematic
diagram of the proposed system is depicted in fig-
ure 1.
</bodyText>
<subsectionHeader confidence="0.999444">
3.1 Problem Formulation
</subsectionHeader>
<bodyText confidence="0.999889428571429">
Suppose there are D features available, and these
are denoted by F1, ..., FD, where A = {Fi : i =
1, ..., D}. Determine the subset of features A′ ⊆ A
such that we learn a classifier with these subset of
features and optimize some metrics. In our pro-
posed multiobjective DE, we optimize two func-
tions, namely precision and recall.
</bodyText>
<subsectionHeader confidence="0.9959875">
3.2 Problem Representation and Population
Initialization
</subsectionHeader>
<bodyText confidence="0.99998675">
All the chromosomes are initialized with the bi-
nary values of either 0 or 1, where 1 denotes that
the corresponding feature is present and 0 denotes
that the corresponding feature is off. Total number
of available features denote the length of the chro-
mosome, and we set this as D. A classifier learns
with the available set of features. One example of
chromosome representation is shown in Figure 2.
</bodyText>
<subsectionHeader confidence="0.999131">
3.3 Fitness Computation
</subsectionHeader>
<bodyText confidence="0.999882">
The fitness computation corresponds to determin-
ing the values precision and recall as two objective
functions. If M number of features are present in
the chromosome, a classifier is trained with these
</bodyText>
<page confidence="0.997791">
62
</page>
<figureCaption confidence="0.999643">
Figure 1: Proposed methodology (a) Step 1 (b)
Step 2.
Figure 2: Chromosome representation: Here
</figureCaption>
<bodyText confidence="0.983751">
#available features = 15 and #features present =
8
M number of features. The classifier is then evalu-
ated on the development data. We calculate preci-
sion and recall as the two objective functions. The
goal is to maximize these two functions.
</bodyText>
<subsectionHeader confidence="0.986755">
3.4 Mutation
</subsectionHeader>
<bodyText confidence="0.830417333333333">
In mutation process, a mutant vector Vi,G+1 is
generated for each target vector Xi,G; i =
1, 2, 3, ... , NP, according to
</bodyText>
<equation confidence="0.998446">
Vi,G+1 = xr1,G + F × (xr2,G − xr3,G), (1)
</equation>
<bodyText confidence="0.99996925">
where r1, r2, r3 are generated randomly with dif-
ferent indices, not equals to current index i and be-
long to {1, 2, ... , NP}, G is the generation num-
ber and F is the mutant factor which is set to 0.5. If
the parameters of mutant vector vi,j,G+1 &gt; 1, then
the parameter values are set to 1. If the parameters
of mutant vector vi,j,G+1 &lt; 0, then the parameter
values are set to 0.
</bodyText>
<subsectionHeader confidence="0.753072">
3.5 Crossover
</subsectionHeader>
<bodyText confidence="0.998974833333333">
To generate better solutions (represented by the
chromosomes) to the next generation population,
crossover is needed. The parameter mixing of
the target vector Xi,G and mutant vector Vi,G+1 is
called crossover. Crossover generates a trial vector
as follows:
</bodyText>
<equation confidence="0.78043025">
Ui,G+1 = (u1,i,G+1, u2,i,G+1, ... , uD,i,G+1) (2)
where
uj,i,G+1 = vj,i,G+1 if (rj &lt; CR) or j = ir (3)
= xj,i,G if (rj &gt; CR) and j ≠ ir (4)
</equation>
<bodyText confidence="0.992926142857143">
for j = 1, 2, ... , D, where rj is an uniform ran-
dom number of the jth evaluation which belongs
to [0, 1] and CR is crossover constant which is set
to 0.5. The index value ir belongs to {1, 2, ... , D}
that ensures that at least one parameter of trial vec-
tors Ui,G+1 gets one parameter from the mutant
vector Vi,G+1.
</bodyText>
<subsectionHeader confidence="0.993436">
3.6 Selection
</subsectionHeader>
<bodyText confidence="0.999989473684211">
In selection process, trial vectors are merged with
the current population to get the best NP solu-
tions from the merged solutions 2 × NP in the
next generation population. The merged solutions
are sorted based on dominated and non-dominated
concept and generate ranked solutions. As an ex-
ample, the dominated and non-dominated sorting
are shown in Figure 3. The non-dominated solu-
tions are represented in the pareto-optimal surface.
The non-dominated solutions are added to the next
generation population until the number of solu-
tions becomes equal to NP. If the number of so-
lutions in a particular rank exceeds NP, then it is
sorted based on crowding distance algorithm. The
required number of solutions are added from the
beginning of the sorted rank to make NP number
solutions in the next generation population. The
selection process determines the best NP number
of solutions in the next generation population.
</bodyText>
<subsectionHeader confidence="0.964684">
3.7 Termination Condition
</subsectionHeader>
<bodyText confidence="0.99490725">
Mutation, fitness computation, crossover and se-
lection processes run for a maximum number of
generations. At the end, we get a set of non-
dominated solutions.
</bodyText>
<page confidence="0.998323">
63
</page>
<figureCaption confidence="0.9907955">
Figure 3: Representation of dominated and non-
dominated solutions
</figureCaption>
<subsectionHeader confidence="0.995903">
3.8 Selecting the best solution
</subsectionHeader>
<bodyText confidence="0.999969785714286">
The multiobjective optimization (MOO) based al-
gorithm yields a set of solutions on the Pareto op-
timal front at the end. None of these solutions
is better compared to the others. However, we
may often require to find out a solution at the
end. Depending upon the user’s requirements dif-
ferent criteria for selecting the best solutions can
exist. Each feature vector of the final Pareto op-
timal front generates a classifier. We compute the
F-measure value on the development set for each
classifier. We select the solution which reports
highest F-measure value. The features encoded in
this chromosome is used to train a CRF and report
the final evaluation on the test data.
</bodyText>
<subsectionHeader confidence="0.983509">
3.9 Feature Set
</subsectionHeader>
<bodyText confidence="0.9912235">
In this section we describe the features that we
implement for performing NER. The features
are domain-independent and we implement these
without using any external resources and/or tools.
</bodyText>
<listItem confidence="0.9033338">
1. Local context: We use local contextual in-
formation as the features of CRF. We use pre-
vious few and succeeding few words as the
features for learning.
2. Part-of-Speech information: PoS informa-
</listItem>
<bodyText confidence="0.955363">
tion is one of the prominent features in iden-
tifying the NE. We have used CMU-ARK
Twitter NLP tool4 for extracting the PoS in-
formation. We use the PoS information of
preceding and succeeding few tokens as the
features.
</bodyText>
<footnote confidence="0.931996">
4http://www.ark.cs.cmu.edu/TweetNLP/
</footnote>
<listItem confidence="0.990649431818182">
3. Word length: From the given training data
we observed that NEs generally become
longer in lengths. We define a feature that
is set to high if the length of the candidate
token exceeds a predetermined threshold. In
our case we assume the token to be a NE if
its length exceeds 5 characters.
4. Suffix and Prefix: Suffixes and prefixes of
length upto 4 characters of the current word
are used as the features.
5. Word normalization: We normalize the cur-
rent token and use it as a feature. For normal-
ization we map the capitalized letter to ‘A’,
small letter to ‘a’ and numbers or symbols to
‘x’.
6. Previous word: We prepare a list of most
frequent words that appear before a NE in the
training data. A binary valued feature is then
defined that fires if the current word appears
in this list.
7. Stop word: This checks whether the current
word appears in the list of stop words or not.
We obtain the list of stop words available at
5.
8. Uppercase: This feature checks whether the
current word starts with a capital letter or
contains an upper case letter inside the word
or all the characters of the word are capital-
ized.
9. All digit: This feature checks whether the
current token is consisting of only digits.
10. AlphaDigit: Tokens having combination of
alphabet and digit have less probability of be-
ing a NE. This concept is used to define a bi-
nary feature in the proposed work which fires
when the token is alphanumeric.
11. First &amp; last word: Tweet level information
are employed for defining two features i.e. if
the current token is the first or last word of a
particular tweet.
12. Word frequency: We observe that most fre-
quently occurring words have a tendency of
not being NE. We prepare a list of most fre-
quent words from the training data. A binary
</listItem>
<footnote confidence="0.997354">
5http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
</footnote>
<page confidence="0.999545">
64
</page>
<bodyText confidence="0.998157222222222">
valued feature is then defined that checks
whether the current word appears in this list
or not.
13. Gazetteer: We prepare a list of NEs from
the training and development datasets. Along
with the NE we also store the NE types. We
define an integer-valued feature that takes the
value that corresponds to the respective NE
type.
</bodyText>
<sectionHeader confidence="0.993741" genericHeader="evaluation">
4 Datasets and Experiments
</sectionHeader>
<bodyText confidence="0.999795">
In this section we firstly describe the datasets and
then report the evaluation results.
</bodyText>
<subsectionHeader confidence="0.997744">
4.1 Data Set
</subsectionHeader>
<bodyText confidence="0.999861882352941">
As discussed earlier, objective of the shared task
was to identify both the coarse-grained and fine-
grained NE from the tweets. Shared task orga-
nizers provided two separate versions of train-
ing (trainnotype and train10type) datasets and
four versions of development datasets (devnotype,
dev2015notype, dev10type and dev201510type).
The training dataset comprise of 1,795 tweets
while development datasets comprises of 599 &amp;
420 tweets for dev and dev2015, respectively. A
total of 1,768 NEs are present in the dataset, out
of which 1,140 are present in the training set and
rest 628 are present in the development set. Brief
statistics of the datasets are shown in Table 1 and
Table 2 for the coarse-grained NE tagged and fine-
grained NE tagged datasets, respectively. Gold
standard test datasets comprise of 1,000 tweets.
</bodyText>
<table confidence="0.998928">
Dataset # Tweets # Token # NE
train 1795 34899 1140
dev 599 11570 356
dev2015 420 6789 272
test2015 1000 16261 -
</table>
<tableCaption confidence="0.999853">
Table 1: Statistics of the coarse-grained dataset
</tableCaption>
<subsectionHeader confidence="0.990544">
4.2 Experimental Results
</subsectionHeader>
<bodyText confidence="0.824382833333333">
As a base learning algorithm we make use of
Conditional Random Field (CRF)(Lafferty et al.,
2001). We use the CRF++ 6 based package for
our experiments. Evaluation of all the systems
are performed in compliance with CoNLL 2002
evaluation script7 as recommended in the shared
</bodyText>
<footnote confidence="0.9991485">
6http://taku910.github.io/crfpp/
7http://www.cnts.ua.ac.be/conll2002/ner/bin/conlleval.txt
</footnote>
<table confidence="0.999642727272727">
Types train dev dev2015
person 332 117 73
product 79 18 9
company 130 41 33
geo-loc 218 58 46
movie 31 3 3
musicartist 43 12 13
tvshow 26 8 6
facility 84 20 7
sportsteam 33 18 35
other 164 61 47
</table>
<tableCaption confidence="0.999552">
Table 2: Statistics of the fine-grained dataset.
</tableCaption>
<bodyText confidence="0.999928088235294">
task. For comparative analysis a baseline system
was also provided by the organizers for both fine-
grained and coarse-grained versions. We started
our experiments by training the model on the fea-
tures defined in Section 3.9. Iteratively we have
trained, tested and evaluated the system in order to
find out the best fitting feature sets. Afterwards we
shifted our focus to DE for optimizing the feature
set in terms of relevant features and its context in-
formation. DE was initialized with the population
size equal to 100, and it was executed for 50 gener-
ations. We have carried out these experiments for
both fine-grained and coarse-grained datasets. On
termination, multiobjective differential evolution
(MODE) reported optimized feature combinations
for both the types of datasets. At the final step
these optimized feature combinations were used to
build the final system. We show the optimized fea-
ture sets as determined by MODE in Table 3.
Results of various models along with the base-
line are reported in Table 4. The upper half of the
table contains the experimental results for three
systems. These three models correspond to the of-
ficial baseline model, model developed with all the
features and the model developed with the selected
features of DE. The MODE based feature selec-
tion model yields the F-measure value of 56.81%
for the test2015 dataset. It is evident that it per-
forms well above the official baseline that showed
the F-measure value of 49.88%. Similarly for the
fine-grained NE types (lower half of the table) our
system (39.84% F-measure) is convincingly ahead
of the baseline model (31.97% F-measure) for the
official test data (test2015).
</bodyText>
<page confidence="0.997951">
65
</page>
<table confidence="0.999893368421053">
Types Dataset Model Precision Recall F-measure Accuracy
notype dev Baseline 65.25 55.90 60.21 96.95
All features 65.08 57.58 61.10 96.92
MODE 69.81 62.36 65.88 97.12
dev2015 Baseline 55.79 49.82 52.63 95.08
All features 51.49 50.92 51.21 94.31
MODE 60.97 53.51 57.43 95.60
test2015 Baseline 53.86 46.44 49.88 95.01
All features 52.37 56.32 54.27 95.55
MODE 63.43 51.44 56.81 95.50
10type dev Baseline 57.04 44.38 49.92 96.44
All features 61.23 39.04 47.68 96.29
MODE 70.71 39.33 50.54 96.43
dev2015 Baseline 38.53 30.88 34.29 94.14
All features 37.14 23.90 29.08 93.50
MODE 48.33 24.26 32.35 94.33
test2015 Baseline 35.56 29.05 31.97 93.41
All features 42.41 30.00 35.14 94.94
MODE 60.68 29.65 39.84 94.54
</table>
<tableCaption confidence="0.997421">
Table 4: Results of various systems on different dataset. All values are in %.
</tableCaption>
<table confidence="0.999818625">
Features C-grained F-grained
POS √ √
WordLength √ √
Suffix √ √
Prefix √ √
WordNorm √ √
PrevOccur √
Stop word
InitCap √
AllCap
InnerCap √
AllDigit
AlphaDigit √ √
First &amp; last word
WordFreq
Gazetteer √
</table>
<tableCaption confidence="0.999011">
Table 3: Optimized feature sets.
</tableCaption>
<sectionHeader confidence="0.995429" genericHeader="conclusions">
5 Conclusion
</sectionHeader>
<bodyText confidence="0.999426444444445">
In this paper we have presented our works that
we carried out as part of our participation in the
Twitter NER shared task. We have used a set of
features which were implemented without using
much domain specific resources and/or tools. We
have considered various combinations of features
and finally select the combination that yields the
best result. We further apply MODE based feature
selection on this feature set. Official evaluation
shows F-measure of 39.84% for the fine-grained
NE types and 56.81% F-measure for the coarse-
grained NE type.
In future we would like to carry out more com-
prehensive analysis on the evaluation results. The
features that we used here are very general in
nature. In future we would like to investigate
domain-specific features to improve the accuracy
of the system.
</bodyText>
<sectionHeader confidence="0.999276" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.998740466666667">
Daniel M. Bikel, Richard Schwartz, and Ralph M.
Weischedel. 1999. An algorithm that learns what’s
in a name. Mach. Learn., 34(1-3):211–231, Febru-
ary.
Leon Derczynski, Diana Maynard, Giuseppe Rizzo,
Marieke van Erp, Genevieve Gorrell, Raphal Troncy,
Johann Petrak, and Kalina Bontcheva. 2015. Anal-
ysis of named entity recognition and linking for
tweets. Information Processing &amp; Management,
51(2):32–49.
Asif Ekbal and Sivaji Bandyopadhyay. 2008a. Bengali
named entity recognition using support vector ma-
chine. In Third International Joint Conference on
Natural Language Processing, IJCNLP 2008, Hy-
derabad, India, January 7-12, 2008, pages 51–58.
</reference>
<page confidence="0.855086">
66
</page>
<reference confidence="0.999867803921569">
Asif Ekbal and Sivaji Bandyopadhyay. 2008b. Named
entity recognition in indian languages using maxi-
mum entropy approach. Int. J. Comput. Proc. Ori-
ental Lang., 21(3):205–237.
John D. Lafferty, Andrew McCallum, and Fernando
C. N. Pereira. 2001. Conditional Random Fields:
Probabilistic Models for Segmenting and Labeling
Sequence Data. In ICML, pages 282–289.
Chenliang Li, Jianshu Weng, Qi He, Yuxia Yao, An-
witaman Datta, Aixin Sun, and Bu-Sung Lee. 2012.
Twiner: Named entity recognition in targeted twit-
ter stream. In Proceedings of the 35th International
ACM SIGIR Conference on Research and Develop-
ment in Information Retrieval, SIGIR ’12, pages
721–730, New York, NY, USA. ACM.
Xiaohua Liu, Shaodian Zhang, Furu Wei, and Ming
Zhou. 2011. Recognizing named entities in tweets.
In Proceedings of the 49th Annual Meeting of the
Association for Computational Linguistics: Human
Language Technologies - Volume 1, HLT ’11, pages
359–367, Stroudsburg, PA, USA. Association for
Computational Linguistics.
B. Locke and J. Martin. 2009. Named entity recog-
nition: Adapting to microblogging. University of
Colorado.
Daniel Ramage, David Hall, Ramesh Nallapati, and
Christopher D. Manning. 2009. Labeled lda: A su-
pervised topic model for credit attribution in multi-
labeled corpora. In Proceedings of the 2009 Con-
ference on Empirical Methods in Natural Language
Processing: Volume 1 - Volume 1, EMNLP ’09,
pages 248–256, Stroudsburg, PA, USA. Association
for Computational Linguistics.
Alan Ritter, Sam Clark, Mausam, and Oren Etzioni.
2011. Named entity recognition in tweets: An ex-
perimental study. In Proceedings of the Conference
on Empirical Methods in Natural Language Pro-
cessing, EMNLP ’11, pages 1524–1534, Strouds-
burg, PA, USA. Association for Computational Lin-
guistics.
Utpal Kumar Sikdar, Asif Ekbal, and Sriparna Saha.
2012. Differential evolution based feature selection
and classifier ensemble for named entity recogni-
tion. In COLING 2012, 24th International Confer-
ence on Computational Linguistics, Proceedings of
the Conference: Technical Papers, 8-15 December
2012, Mumbai, India, pages 2475–2490.
Rainer Storn and Kenneth Price. 1997. Differential
evolution a simple and efficient heuristic for global
optimization over continuous spaces. J. of Global
Optimization, 11(4):341–359, December.
</reference>
<page confidence="0.999507">
67
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.427316">
<title confidence="0.998699">IITP: Multiobjective Differential Evolution based Twitter Named Entity Recognition</title>
<author confidence="0.988554">Md Shad Akhtar</author>
<author confidence="0.988554">Utpal Kumar Sikdar</author>
<author confidence="0.988554">Asif</author>
<affiliation confidence="0.846775">Dept of Computer Science and IIT Patna,</affiliation>
<email confidence="0.936204">(shad.pcs15,utpal.sikdar,asif)@iitp.ac.in</email>
<abstract confidence="0.993865956521739">In this paper we propose a differential evolution (DE) based named entity recognition (NER) system in twitter data. In the first step, we develop various NER systems using different combinations of the features. We implemented these features without using any domain-specific features and/or resources. As a base classifier we use Conditional Random Field (CRF). In the second step, we propose a DE based feature selection approach to determine the most relevant set of features and its context information. The optimized feature set applied to the training set yields the precision, recall and Fmeasure values of 60.68%, 29.65% and 39.84%, respectively for the fine-grained named entity (NE) types. When we consider only the coarse-grained NE types, it shows the precision, recall and F-measure values of 63.43%, 51.44% and 56.81%, respectively.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Daniel M Bikel</author>
<author>Richard Schwartz</author>
<author>Ralph M Weischedel</author>
</authors>
<title>An algorithm that learns what’s in a name.</title>
<date>1999</date>
<pages>34--1</pages>
<location>Mach. Learn.,</location>
<contexts>
<context position="2734" citStr="Bikel et al., 1999" startWordPosition="428" endWordPosition="431">al language processing (NLP). Also in order to show their emotions, users sometime put extra emphasis by elongating a valid word (e.g. yeeesssss!! for yes). Named entity recognition (NER) can be seen as one of the important and foremost tasks for many natural language processing (NLP) tasks such as machine translation, information extraction, question-answering etc. The task of NER can be thought of as a two-step process that involves identifying proper names from the text and classifying them into some predefined categories such as person, organization, location etc. Although the techniques (Bikel et al., 1999; Ekbal and Bandyopadhyay, 2008a; Ekbal and Bandyopadhyay, 2008b; Sikdar et al., 2012) for recognizing named entities (NEs) in newswire and other well-formatted traditional corpus has already matured but it is still a challenging task to perform in unstructured and noisy twitter data. The concept of NER in twitter has recently drawn the attention of researchers worldwide. Very few authors have reported their works (Liu et al., 2011; Ramage et al., 2009; Li et al., 2012) for NER in twitter. A semi-supervised model for NER has been reported in (Liu et al., 2011) where K-nearest neighbour classif</context>
</contexts>
<marker>Bikel, Schwartz, Weischedel, 1999</marker>
<rawString>Daniel M. Bikel, Richard Schwartz, and Ralph M. Weischedel. 1999. An algorithm that learns what’s in a name. Mach. Learn., 34(1-3):211–231, February.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Leon Derczynski</author>
<author>Diana Maynard</author>
<author>Giuseppe Rizzo</author>
<author>Marieke van Erp</author>
<author>Genevieve Gorrell</author>
<author>Raphal Troncy</author>
<author>Johann Petrak</author>
<author>Kalina Bontcheva</author>
</authors>
<title>Analysis of named entity recognition and linking for tweets.</title>
<date>2015</date>
<journal>Information Processing &amp; Management,</journal>
<volume>51</volume>
<issue>2</issue>
<marker>Derczynski, Maynard, Rizzo, van Erp, Gorrell, Troncy, Petrak, Bontcheva, 2015</marker>
<rawString>Leon Derczynski, Diana Maynard, Giuseppe Rizzo, Marieke van Erp, Genevieve Gorrell, Raphal Troncy, Johann Petrak, and Kalina Bontcheva. 2015. Analysis of named entity recognition and linking for tweets. Information Processing &amp; Management, 51(2):32–49.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Asif Ekbal</author>
<author>Sivaji Bandyopadhyay</author>
</authors>
<title>Bengali named entity recognition using support vector machine.</title>
<date>2008</date>
<booktitle>In Third International Joint Conference on Natural Language Processing, IJCNLP 2008,</booktitle>
<pages>51--58</pages>
<location>Hyderabad, India,</location>
<contexts>
<context position="2765" citStr="Ekbal and Bandyopadhyay, 2008" startWordPosition="432" endWordPosition="435">ng (NLP). Also in order to show their emotions, users sometime put extra emphasis by elongating a valid word (e.g. yeeesssss!! for yes). Named entity recognition (NER) can be seen as one of the important and foremost tasks for many natural language processing (NLP) tasks such as machine translation, information extraction, question-answering etc. The task of NER can be thought of as a two-step process that involves identifying proper names from the text and classifying them into some predefined categories such as person, organization, location etc. Although the techniques (Bikel et al., 1999; Ekbal and Bandyopadhyay, 2008a; Ekbal and Bandyopadhyay, 2008b; Sikdar et al., 2012) for recognizing named entities (NEs) in newswire and other well-formatted traditional corpus has already matured but it is still a challenging task to perform in unstructured and noisy twitter data. The concept of NER in twitter has recently drawn the attention of researchers worldwide. Very few authors have reported their works (Liu et al., 2011; Ramage et al., 2009; Li et al., 2012) for NER in twitter. A semi-supervised model for NER has been reported in (Liu et al., 2011) where K-nearest neighbour classifier is combined with CRF. Appli</context>
</contexts>
<marker>Ekbal, Bandyopadhyay, 2008</marker>
<rawString>Asif Ekbal and Sivaji Bandyopadhyay. 2008a. Bengali named entity recognition using support vector machine. In Third International Joint Conference on Natural Language Processing, IJCNLP 2008, Hyderabad, India, January 7-12, 2008, pages 51–58.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Asif Ekbal</author>
<author>Sivaji Bandyopadhyay</author>
</authors>
<title>Named entity recognition in indian languages using maximum entropy approach.</title>
<date>2008</date>
<journal>Int. J. Comput. Proc. Oriental Lang.,</journal>
<volume>21</volume>
<issue>3</issue>
<contexts>
<context position="2765" citStr="Ekbal and Bandyopadhyay, 2008" startWordPosition="432" endWordPosition="435">ng (NLP). Also in order to show their emotions, users sometime put extra emphasis by elongating a valid word (e.g. yeeesssss!! for yes). Named entity recognition (NER) can be seen as one of the important and foremost tasks for many natural language processing (NLP) tasks such as machine translation, information extraction, question-answering etc. The task of NER can be thought of as a two-step process that involves identifying proper names from the text and classifying them into some predefined categories such as person, organization, location etc. Although the techniques (Bikel et al., 1999; Ekbal and Bandyopadhyay, 2008a; Ekbal and Bandyopadhyay, 2008b; Sikdar et al., 2012) for recognizing named entities (NEs) in newswire and other well-formatted traditional corpus has already matured but it is still a challenging task to perform in unstructured and noisy twitter data. The concept of NER in twitter has recently drawn the attention of researchers worldwide. Very few authors have reported their works (Liu et al., 2011; Ramage et al., 2009; Li et al., 2012) for NER in twitter. A semi-supervised model for NER has been reported in (Liu et al., 2011) where K-nearest neighbour classifier is combined with CRF. Appli</context>
</contexts>
<marker>Ekbal, Bandyopadhyay, 2008</marker>
<rawString>Asif Ekbal and Sivaji Bandyopadhyay. 2008b. Named entity recognition in indian languages using maximum entropy approach. Int. J. Comput. Proc. Oriental Lang., 21(3):205–237.</rawString>
</citation>
<citation valid="true">
<authors>
<author>John D Lafferty</author>
<author>Andrew McCallum</author>
<author>Fernando C N Pereira</author>
</authors>
<title>Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data.</title>
<date>2001</date>
<booktitle>ICML,</booktitle>
<pages>282--289</pages>
<contexts>
<context position="4790" citStr="Lafferty et al., 2001" startWordPosition="764" endWordPosition="767">with this a shared task “ACL2015 W-NUT: Named Entity Recognition in Twitter”3 was organized. The work that we report here is a part of this shared task. The main objective of the shared task was to efficiently identify various coarse-grained and finegrained named entities. Fine-grained NE types include 10 different categories namely, person, product, company, geo-loc, movie, musicartist, tvshow, facility, sportsteam and other. We have used a rich feature set based on lexical and syntactic properties of a tweet as discussed in Section 3.9. Our proposed work uses Conditional Random Field (CRF) (Lafferty et al., 2001) as learning algorithm, which is very efficient as a sequence learner. Subsequently we have applied Differential Evolution (DE), a stochastic, population based optimization algorithm, introduced by Storn and Prince in 1996 (Storn and Price, 1997), to obtain the optimal feature set for NER in twitter data. The organization of the paper is as follows. Section 2 provides a very brief theoretical discussion of DE. Feature set and methodology used in the proposed work are discussed in Section 3. Experimental result and analysis can be found in Section 4. We conclude the paper in Section 5. 2 MultiO</context>
<context position="15938" citStr="Lafferty et al., 2001" startWordPosition="2656" endWordPosition="2659">8 NEs are present in the dataset, out of which 1,140 are present in the training set and rest 628 are present in the development set. Brief statistics of the datasets are shown in Table 1 and Table 2 for the coarse-grained NE tagged and finegrained NE tagged datasets, respectively. Gold standard test datasets comprise of 1,000 tweets. Dataset # Tweets # Token # NE train 1795 34899 1140 dev 599 11570 356 dev2015 420 6789 272 test2015 1000 16261 - Table 1: Statistics of the coarse-grained dataset 4.2 Experimental Results As a base learning algorithm we make use of Conditional Random Field (CRF)(Lafferty et al., 2001). We use the CRF++ 6 based package for our experiments. Evaluation of all the systems are performed in compliance with CoNLL 2002 evaluation script7 as recommended in the shared 6http://taku910.github.io/crfpp/ 7http://www.cnts.ua.ac.be/conll2002/ner/bin/conlleval.txt Types train dev dev2015 person 332 117 73 product 79 18 9 company 130 41 33 geo-loc 218 58 46 movie 31 3 3 musicartist 43 12 13 tvshow 26 8 6 facility 84 20 7 sportsteam 33 18 35 other 164 61 47 Table 2: Statistics of the fine-grained dataset. task. For comparative analysis a baseline system was also provided by the organizers fo</context>
</contexts>
<marker>Lafferty, McCallum, Pereira, 2001</marker>
<rawString>John D. Lafferty, Andrew McCallum, and Fernando C. N. Pereira. 2001. Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data. In ICML, pages 282–289.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Chenliang Li</author>
<author>Jianshu Weng</author>
<author>Qi He</author>
<author>Yuxia Yao</author>
<author>Anwitaman Datta</author>
<author>Aixin Sun</author>
<author>Bu-Sung Lee</author>
</authors>
<title>Twiner: Named entity recognition in targeted twitter stream.</title>
<date>2012</date>
<booktitle>In Proceedings of the 35th International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR ’12,</booktitle>
<pages>721--730</pages>
<publisher>ACM.</publisher>
<location>New York, NY, USA.</location>
<contexts>
<context position="3208" citStr="Li et al., 2012" startWordPosition="506" endWordPosition="509">t and classifying them into some predefined categories such as person, organization, location etc. Although the techniques (Bikel et al., 1999; Ekbal and Bandyopadhyay, 2008a; Ekbal and Bandyopadhyay, 2008b; Sikdar et al., 2012) for recognizing named entities (NEs) in newswire and other well-formatted traditional corpus has already matured but it is still a challenging task to perform in unstructured and noisy twitter data. The concept of NER in twitter has recently drawn the attention of researchers worldwide. Very few authors have reported their works (Liu et al., 2011; Ramage et al., 2009; Li et al., 2012) for NER in twitter. A semi-supervised model for NER has been reported in (Liu et al., 2011) where K-nearest neighbour classifier is combined with CRF. Application of LabeledLDA (Ramage et al., 2009) in supervised environment can be found in (Ritter et al., 2011). Their method classifies NEs into fine-grained types of 10 classes (as in our case). In another work (Li et al., 2012), authors have used random walk model to build an unsupervised approach to NER. They modelled their system on local(tweets) and global (www) context without employing any of the linguistic features. 61 Proceedings of t</context>
</contexts>
<marker>Li, Weng, He, Yao, Datta, Sun, Lee, 2012</marker>
<rawString>Chenliang Li, Jianshu Weng, Qi He, Yuxia Yao, Anwitaman Datta, Aixin Sun, and Bu-Sung Lee. 2012. Twiner: Named entity recognition in targeted twitter stream. In Proceedings of the 35th International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR ’12, pages 721–730, New York, NY, USA. ACM.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Xiaohua Liu</author>
<author>Shaodian Zhang</author>
<author>Furu Wei</author>
<author>Ming Zhou</author>
</authors>
<title>Recognizing named entities in tweets.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11,</booktitle>
<pages>359--367</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="3169" citStr="Liu et al., 2011" startWordPosition="498" endWordPosition="501">s identifying proper names from the text and classifying them into some predefined categories such as person, organization, location etc. Although the techniques (Bikel et al., 1999; Ekbal and Bandyopadhyay, 2008a; Ekbal and Bandyopadhyay, 2008b; Sikdar et al., 2012) for recognizing named entities (NEs) in newswire and other well-formatted traditional corpus has already matured but it is still a challenging task to perform in unstructured and noisy twitter data. The concept of NER in twitter has recently drawn the attention of researchers worldwide. Very few authors have reported their works (Liu et al., 2011; Ramage et al., 2009; Li et al., 2012) for NER in twitter. A semi-supervised model for NER has been reported in (Liu et al., 2011) where K-nearest neighbour classifier is combined with CRF. Application of LabeledLDA (Ramage et al., 2009) in supervised environment can be found in (Ritter et al., 2011). Their method classifies NEs into fine-grained types of 10 classes (as in our case). In another work (Li et al., 2012), authors have used random walk model to build an unsupervised approach to NER. They modelled their system on local(tweets) and global (www) context without employing any of the l</context>
</contexts>
<marker>Liu, Zhang, Wei, Zhou, 2011</marker>
<rawString>Xiaohua Liu, Shaodian Zhang, Furu Wei, and Ming Zhou. 2011. Recognizing named entities in tweets. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11, pages 359–367, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>B Locke</author>
<author>J Martin</author>
</authors>
<title>Named entity recognition: Adapting to microblogging.</title>
<date>2009</date>
<institution>University of Colorado.</institution>
<contexts>
<context position="4045" citStr="Locke and Martin, 2009" startWordPosition="642" endWordPosition="645">environment can be found in (Ritter et al., 2011). Their method classifies NEs into fine-grained types of 10 classes (as in our case). In another work (Li et al., 2012), authors have used random walk model to build an unsupervised approach to NER. They modelled their system on local(tweets) and global (www) context without employing any of the linguistic features. 61 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 61–67, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics Few more related works can be found in (Derczynski et al., 2015) and (Locke and Martin, 2009). Due to several challenges it pose, recently there has been a huge interest to identify NE in twitter data. In compliance with this a shared task “ACL2015 W-NUT: Named Entity Recognition in Twitter”3 was organized. The work that we report here is a part of this shared task. The main objective of the shared task was to efficiently identify various coarse-grained and finegrained named entities. Fine-grained NE types include 10 different categories namely, person, product, company, geo-loc, movie, musicartist, tvshow, facility, sportsteam and other. We have used a rich feature set based on lexic</context>
</contexts>
<marker>Locke, Martin, 2009</marker>
<rawString>B. Locke and J. Martin. 2009. Named entity recognition: Adapting to microblogging. University of Colorado.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Daniel Ramage</author>
<author>David Hall</author>
<author>Ramesh Nallapati</author>
<author>Christopher D Manning</author>
</authors>
<title>Labeled lda: A supervised topic model for credit attribution in multilabeled corpora.</title>
<date>2009</date>
<booktitle>In Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 1 - Volume 1, EMNLP ’09,</booktitle>
<pages>248--256</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="3190" citStr="Ramage et al., 2009" startWordPosition="502" endWordPosition="505">er names from the text and classifying them into some predefined categories such as person, organization, location etc. Although the techniques (Bikel et al., 1999; Ekbal and Bandyopadhyay, 2008a; Ekbal and Bandyopadhyay, 2008b; Sikdar et al., 2012) for recognizing named entities (NEs) in newswire and other well-formatted traditional corpus has already matured but it is still a challenging task to perform in unstructured and noisy twitter data. The concept of NER in twitter has recently drawn the attention of researchers worldwide. Very few authors have reported their works (Liu et al., 2011; Ramage et al., 2009; Li et al., 2012) for NER in twitter. A semi-supervised model for NER has been reported in (Liu et al., 2011) where K-nearest neighbour classifier is combined with CRF. Application of LabeledLDA (Ramage et al., 2009) in supervised environment can be found in (Ritter et al., 2011). Their method classifies NEs into fine-grained types of 10 classes (as in our case). In another work (Li et al., 2012), authors have used random walk model to build an unsupervised approach to NER. They modelled their system on local(tweets) and global (www) context without employing any of the linguistic features. 6</context>
</contexts>
<marker>Ramage, Hall, Nallapati, Manning, 2009</marker>
<rawString>Daniel Ramage, David Hall, Ramesh Nallapati, and Christopher D. Manning. 2009. Labeled lda: A supervised topic model for credit attribution in multilabeled corpora. In Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 1 - Volume 1, EMNLP ’09, pages 248–256, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Sam Clark</author>
<author>Mausam</author>
<author>Oren Etzioni</author>
</authors>
<title>Named entity recognition in tweets: An experimental study.</title>
<date>2011</date>
<booktitle>In Proceedings of the Conference on Empirical Methods in Natural Language Processing, EMNLP ’11,</booktitle>
<pages>1524--1534</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="3471" citStr="Ritter et al., 2011" startWordPosition="549" endWordPosition="552"> (NEs) in newswire and other well-formatted traditional corpus has already matured but it is still a challenging task to perform in unstructured and noisy twitter data. The concept of NER in twitter has recently drawn the attention of researchers worldwide. Very few authors have reported their works (Liu et al., 2011; Ramage et al., 2009; Li et al., 2012) for NER in twitter. A semi-supervised model for NER has been reported in (Liu et al., 2011) where K-nearest neighbour classifier is combined with CRF. Application of LabeledLDA (Ramage et al., 2009) in supervised environment can be found in (Ritter et al., 2011). Their method classifies NEs into fine-grained types of 10 classes (as in our case). In another work (Li et al., 2012), authors have used random walk model to build an unsupervised approach to NER. They modelled their system on local(tweets) and global (www) context without employing any of the linguistic features. 61 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 61–67, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics Few more related works can be found in (Derczynski et al., 2015) and (Locke and Martin, 2009). Due to several challenge</context>
</contexts>
<marker>Ritter, Clark, Mausam, Etzioni, 2011</marker>
<rawString>Alan Ritter, Sam Clark, Mausam, and Oren Etzioni. 2011. Named entity recognition in tweets: An experimental study. In Proceedings of the Conference on Empirical Methods in Natural Language Processing, EMNLP ’11, pages 1524–1534, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Utpal Kumar Sikdar</author>
<author>Asif Ekbal</author>
<author>Sriparna Saha</author>
</authors>
<title>Differential evolution based feature selection and classifier ensemble for named entity recognition.</title>
<date>2012</date>
<booktitle>In COLING 2012, 24th International Conference on Computational Linguistics, Proceedings of the Conference: Technical Papers,</booktitle>
<pages>2475--2490</pages>
<location>Mumbai, India,</location>
<contexts>
<context position="2820" citStr="Sikdar et al., 2012" startWordPosition="441" endWordPosition="444">ut extra emphasis by elongating a valid word (e.g. yeeesssss!! for yes). Named entity recognition (NER) can be seen as one of the important and foremost tasks for many natural language processing (NLP) tasks such as machine translation, information extraction, question-answering etc. The task of NER can be thought of as a two-step process that involves identifying proper names from the text and classifying them into some predefined categories such as person, organization, location etc. Although the techniques (Bikel et al., 1999; Ekbal and Bandyopadhyay, 2008a; Ekbal and Bandyopadhyay, 2008b; Sikdar et al., 2012) for recognizing named entities (NEs) in newswire and other well-formatted traditional corpus has already matured but it is still a challenging task to perform in unstructured and noisy twitter data. The concept of NER in twitter has recently drawn the attention of researchers worldwide. Very few authors have reported their works (Liu et al., 2011; Ramage et al., 2009; Li et al., 2012) for NER in twitter. A semi-supervised model for NER has been reported in (Liu et al., 2011) where K-nearest neighbour classifier is combined with CRF. Application of LabeledLDA (Ramage et al., 2009) in supervise</context>
</contexts>
<marker>Sikdar, Ekbal, Saha, 2012</marker>
<rawString>Utpal Kumar Sikdar, Asif Ekbal, and Sriparna Saha. 2012. Differential evolution based feature selection and classifier ensemble for named entity recognition. In COLING 2012, 24th International Conference on Computational Linguistics, Proceedings of the Conference: Technical Papers, 8-15 December 2012, Mumbai, India, pages 2475–2490.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Rainer Storn</author>
<author>Kenneth Price</author>
</authors>
<title>Differential evolution a simple and efficient heuristic for global optimization over continuous spaces.</title>
<date>1997</date>
<journal>J. of Global Optimization,</journal>
<volume>11</volume>
<issue>4</issue>
<contexts>
<context position="5036" citStr="Storn and Price, 1997" startWordPosition="800" endWordPosition="803">finegrained named entities. Fine-grained NE types include 10 different categories namely, person, product, company, geo-loc, movie, musicartist, tvshow, facility, sportsteam and other. We have used a rich feature set based on lexical and syntactic properties of a tweet as discussed in Section 3.9. Our proposed work uses Conditional Random Field (CRF) (Lafferty et al., 2001) as learning algorithm, which is very efficient as a sequence learner. Subsequently we have applied Differential Evolution (DE), a stochastic, population based optimization algorithm, introduced by Storn and Prince in 1996 (Storn and Price, 1997), to obtain the optimal feature set for NER in twitter data. The organization of the paper is as follows. Section 2 provides a very brief theoretical discussion of DE. Feature set and methodology used in the proposed work are discussed in Section 3. Experimental result and analysis can be found in Section 4. We conclude the paper in Section 5. 2 MultiObjective Differential Evolution (DE) Differential Evolution (DE) (Storn and Price, 1997) is a heuristic search optimization technique and it provides near optimal solution for an optimization problem. Within a search space the parameters are enco</context>
</contexts>
<marker>Storn, Price, 1997</marker>
<rawString>Rainer Storn and Kenneth Price. 1997. Differential evolution a simple and efficient heuristic for global optimization over continuous spaces. J. of Global Optimization, 11(4):341–359, December.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>