<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.202730">
<title confidence="0.997814">
Improving Twitter Named Entity Recognition using Word
Representations
</title>
<author confidence="0.998029">
Zhiqiang Toh, Bin Chen and Jian Su
</author>
<affiliation confidence="0.978221">
Institute for Infocomm Research
</affiliation>
<address confidence="0.9059655">
1 Fusionopolis Way
Singapore 138632
</address>
<email confidence="0.998051">
{ztoh,bchen,sujian}@i2r.a-star.edu.sg
</email>
<sectionHeader confidence="0.993849" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.9998790625">
This paper describes our system used in
the ACL 2015 Workshop on Noisy User-
generated Text Shared Task for Named
Entity Recognition (NER) in Twitter. Our
system uses Conditional Random Fields to
train two separate classifiers for the two
evaluations: predicting 10 fine-grained
types, and segmenting named entities. We
focus our efforts on generating word rep-
resentations from large amount of unla-
beled newswire data and tweets. Our
experiment results show that cluster fea-
tures derived from word representations
significantly improve Twitter NER perfor-
mances. Our system is ranked 2nd for both
evaluations.
</bodyText>
<sectionHeader confidence="0.998973" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999605627906977">
Named Entity Recognition (NER) is the task of
identifying and categorizing the various mentions
of people, organizations and other named entities
within the text. NER has been an essential analysis
component in many Natural Language Processing
(NLP) systems, especially information extraction
and question answering.
Traditionally, the NER system is trained and
applied on long and formal text such as the
newswire. From the beginning of the new millen-
nium, user-generated content from the social me-
dia websites such as Twitter and Weibo presents a
huge compilation of informative but noisy and in-
formal text. This rapidly growing text collection
becomes more and more important for NLP tasks
such as sentiment analysis and emerging topic de-
tection.
However, standard NER system trained on for-
mal text does not work well on this new and chal-
lenging style of text. Therefore, adapting the
NER system to the new and challenging Twitter
domain has attracted increasing attention of re-
searchers. The ACL 2015 Workshop on Noisy
User-generated Text (W-NUT) Shared Task for
NER in Twitter is organized in response to these
new changes (Tim Baldwin, 2015).
We participated in the above Shared Task,
which consists of two separate evaluations: one
where the task is to predict 10 fine-grained types
(10types) and the other in which only named entity
segments are predicted (notypes).
For both evaluations, we model the problem as
a sequential labeling task, using Conditional Ran-
dom Fields (CRF) as the training algorithm. An
additional postprocessing step is applied to further
refine the system output.
The remainder of this paper is structured as fol-
lows. In Section 2, we report on the external re-
sources used by our system and how they are ob-
tained and processed. In Section 3, the features
used are described in details. In Section 4, the ex-
periment and official results are presented. Finally,
Section 5 summarizes our work.
</bodyText>
<sectionHeader confidence="0.986226" genericHeader="method">
2 External Resources
</sectionHeader>
<bodyText confidence="0.9997916">
External resources have shown to improve the per-
formances of Twitter NER (Ritter et al., 2011).
Our system uses a variety of external resources,
either publicly available, or collected and prepro-
cessed by us.
</bodyText>
<subsectionHeader confidence="0.998137">
2.1 Freebase Entity Lists
</subsectionHeader>
<bodyText confidence="0.9996542">
We use the Freebase entity lists provided by the
task organizers. For some of the lists that are not
provided (e.g. a list of sports facilities), we man-
ually collect them by calling the appropriate Free-
base API.
</bodyText>
<subsectionHeader confidence="0.998398">
2.2 Unlabeled Corpora
</subsectionHeader>
<bodyText confidence="0.988585333333333">
We gather unlabeled corpora from three differ-
ent sources: (1) Pre-trained word vectors gen-
erated using the GloVe tool (Pennington et al.,
</bodyText>
<page confidence="0.976238">
141
</page>
<note confidence="0.787061">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 141–145,
Beijing, China, July 31, 2015. © 2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.970094083333333">
2014)1, (2) English Gigaword Fifth Edition2, and
(3) raw tweets collected between the period of
March 2015 and April 2015.
For English Gigaword, all articles of story type
are collected and tokenized. Further preprocess-
ing is performed by following the cleaning step
described in Turian et al. (2010). This results in
a corpus consisting of 76 million sentences.
The collected raw tweets are tokenized3
and non-English tweets are removed using
langid.py (Lui and Baldwin, 2012), resulting
in a total of 14 million tweets.
</bodyText>
<sectionHeader confidence="0.999362" genericHeader="method">
3 Features
</sectionHeader>
<bodyText confidence="0.999826">
This section briefly describes the features used in
our system. Besides the features commonly used
in traditional NER systems, we focus on the use
of word cluster features that have shown to be ef-
fective in previous work (Ratinov and Roth, 2009;
Turian et al., 2010; Cherry and Guo, 2015).
</bodyText>
<subsectionHeader confidence="0.999753">
3.1 Word Feature
</subsectionHeader>
<bodyText confidence="0.999962">
The current word and its lowercase format are
used as features. To provide additional context in-
formation, the previous word and next word (in
original format) are also used.
</bodyText>
<subsectionHeader confidence="0.999078">
3.2 Orthographic Features
</subsectionHeader>
<bodyText confidence="0.999976833333333">
Orthographic features based on regular expres-
sions are often used in NER systems. We only use
the following two orthographic features: Initial-
Cap ([A-Z][a-z].*) and AllCaps ([A-Z]+).
In addition, the first character and last two charac-
ters of each word are used as features.
</bodyText>
<subsectionHeader confidence="0.999275">
3.3 Gazetteer Feature
</subsectionHeader>
<bodyText confidence="0.999935333333333">
The current word is matched with entries in the
Freebase entity lists and the feature value is the
type of entity list matched.
</bodyText>
<subsectionHeader confidence="0.996144">
3.4 Word Cluster Features
</subsectionHeader>
<bodyText confidence="0.999600428571429">
Unsupervised word representations (e.g. Brown
clustering) have shown to improve the perfor-
mance of NER. Besides Brown clusters, we also
use clusters generated using the K-means algo-
rithm. These two kinds of clusters are generated
from the processed Gigaword and tweet corpora
(Section 2.2).
</bodyText>
<footnote confidence="0.9918875">
1http://nlp.stanford.edu/projects/glove/
2https://catalog.ldc.upenn.edu/LDC2011T07
3The tweet tokenization script can be found at
https://github.com/myleott/ark-twokenize-py
</footnote>
<bodyText confidence="0.999869673913044">
Brown clusters are generated using the imple-
mentation by Percy Liang4. We experiment with
different cluster sizes ({100, 200, 500, 1000}), re-
sulting in different cluster files for each of the cor-
pora. For each cluster file, different minimum oc-
currences ({5, 10, 20}) and binary prefix lengths
({4, 6, · · · , 14, 16}) are tested. For each word in
the tweet, its corresponding binary prefix string
representation is used as the feature value.
K-means clusters are generated using two dif-
ferent methods. The first method uses the
word2vec tool (Mikolov et al., 2013)5. By vary-
ing the minimum occurrences ({5, 10, 20}), word
vector size ({50, 100, 200, 500, 1000}), cluster
size ({50, 100, 200, 500, 1000}) and sub-sampling
threshold ({0.00001, 0.001}), different cluster
files are generated and tested. Similar to the
Brown cluster feature, the name of the cluster that
each word belongs to is used as the feature value.
The second method uses the GloVe tool to
generate global vectors for word representation6.
As the GloVe tool does not output any form of
clusters, K-means clusters are generated from the
global vectors using the K-means implementa-
tion from Apache Spark MLlib7. Similarly, by
varying the minimum count ({5, 10, 20, 50, 100}),
window size ({5, 10, 15, 20}), vector size
({50, 100, 200, 500, 1000}), and cluster size
({50, 100, 200, 500, 1000}), different cluster files
are generated and tested.
We also generate K-means cluster files us-
ing the pre-trained GloVe word vectors (trained
from Wikipedia 2014 and Gigaword Fifth Edi-
tion, Common Crawl and Twitter data) in the same
manner.
We create a cluster feature for each cluster file
that is found to improve the 5-fold cross valida-
tion performance. As there are over 800 cluster
files, we only test a random subset of cluster files
each time and select the best cluster file from the
subset to create a new cluster feature. The proce-
dure is repeated for a new subset of cluster files,
until no (or negligible) improvement is obtained.
Our final settings use one Brown cluster feature
and six K-means cluster features (for both 10types
and notypes settings).
</bodyText>
<footnote confidence="0.9982542">
4https://github.com/percyliang/brown-cluster/
5https://code.google.com/p/word2vec/
6Due to memory constraints, only the tweet corpus is used
to generate global vectors.
7https://spark.apache.org/mllib/
</footnote>
<page confidence="0.970501">
142
</page>
<table confidence="0.999656428571429">
10types
Feature Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Overall
Word Feature 25.76 23.93 27.59 28.01 9.69 23.94
+ Orthographic Features 36.48 35.64 41.20 43.27 25.34 37.03
+ Gazetteer Feature 44.36 43.94 48.22 44.84 30.35 42.94
+ Word Cluster Features 55.85 57.49 60.07 58.35 44.99 55.95
+ Postprocessing 56.09 57.82 60.07 58.88 45.78 56.31
</table>
<tableCaption confidence="0.9969005">
Table 1: 5-fold cross-validation F1 performances for the 10types evaluation. Each row uses all features
added in the previous rows.
</tableCaption>
<table confidence="0.999607142857143">
notypes
Feature Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Overall
Word Feature 30.91 30.08 33.41 36.99 20.69 31.09
+ Orthographic Features 52.06 54.29 52.22 53.11 44.49 51.62
+ Gazetteer Feature 52.26 56.70 58.78 56.74 47.45 54.77
+ Word Cluster Features 65.14 65.57 66.77 68.13 55.31 64.66
+ Postprocessing 65.44 65.85 67.30 68.70 56.00 65.13
</table>
<tableCaption confidence="0.971232">
Table 2: 5-fold cross-validation F1 performances for the notypes evaluation. Each row uses all features
added in the previous rows.
</tableCaption>
<sectionHeader confidence="0.994102" genericHeader="evaluation">
4 Experiments and Results
</sectionHeader>
<bodyText confidence="0.9999119">
Our system is trained using the CRF++ tool8. We
trained separate classifiers for the two different
evaluations (10types and notypes).
To select the optimum settings, we make use
of all available training data (train, dev,
dev_2015) and conduct 5-fold cross validation
experiments. For easier comparisons with other
systems, the 5 folds are split such that dev is the
test set for Fold 1, while dev_2015 is the test set
for Fold 5.
</bodyText>
<subsectionHeader confidence="0.998193">
4.1 Preliminary Results on Training Data
</subsectionHeader>
<bodyText confidence="0.999788692307692">
Table 1 and Table 2 show the 5-fold cross val-
idation performances after adding each feature
group for the 10types and notypes evaluations re-
spectively. The use of word clusters significantly
improves the performances for both evaluations.
There is an overall improvement of 13% and 9%
for the 10types and notypes evaluation respec-
tively when word cluster features are added. This
demonstrates the usefulness of word vectors in im-
proving the accuracy of a Twitter NER system.
Comparing the performances of Fold 1 (tested
on dev) and Fold 5 (tested on dev_2015),
we observe a significant performance difference.
</bodyText>
<footnote confidence="0.793236">
8http://taku910.github.io/crfpp/
</footnote>
<bodyText confidence="0.9995895">
Similar observations can also be seen for the other
three folds (tested on a subset of train) when
compared with Fold 5. This suggests that there
are notable differences between the data provided
during the training period (train and dev) and
evaluation period (dev_2015), probably because
the two sets of data are collected in different time
periods.
</bodyText>
<subsectionHeader confidence="0.989362">
4.2 Postprocessing
</subsectionHeader>
<bodyText confidence="0.999974705882353">
We also experiment with a postprocessing step
based on heuristic rules to further refine the sys-
tem output (last row of Table 1 and Table 2). The
heuristic rules are based on string matching of
words with name list entries. To prevent false pos-
itives, we require entries in some of the name lists
to contain at least two words and should not con-
tain common words/stop words. For certain name
lists where single-word entries are common but
ambiguous (e.g. name of sports clubs), we check
for the presence of cue words in the tweet be-
fore matching. For example, for single-word sport
team names that are common in tweets, we check
for the presence of cue words such as “vs”. Exam-
ples of name lists used include names of profes-
sional athletes, music composers and sport facili-
ties.
</bodyText>
<page confidence="0.998038">
143
</page>
<table confidence="0.997144428571428">
10types notypes
System Rank Precision Recall F1 Rank Precision Recall F1
NLANGP 2 63.62 43.12 51.40 2 67.74 54.31 60.29
1st 1 57.66 55.22 56.41 1 72.20 69.14 70.63
2nd 2 63.62 43.12 51.40 2 67.74 54.31 60.29
3rd 3 53.24 38.58 44.74 3 63.81 56.28 59.81
Baseline – 35.56 29.05 31.97 – 53.86 46.44 49.88
</table>
<tableCaption confidence="0.995939">
Table 3: Comparison of our system (NLANGP) with the top three participating systems and official
baselines for the 10types and notypes evaluations.
</tableCaption>
<table confidence="0.992888">
10types notypes
System Precision Recall F1 Precision Recall F1
NLANGP 63.62 43.12 51.40 67.74 54.31 60.29
- Word Cluster Features 57.99 25.26 35.19 62.56 38.43 47.61
</table>
<tableCaption confidence="0.999472">
Table 4: System performances on the test data when word cluster features are not used.
</tableCaption>
<subsectionHeader confidence="0.998922">
4.3 Evaluation Results
</subsectionHeader>
<bodyText confidence="0.99991">
Table 3 presents the official results of our 10types
and notypes submissions. We also include the re-
sults of the top three participating systems and of-
ficial baselines for comparison.
As shown from the table, our system
(NLANGP) is ranked 2nd for both evalua-
tions. Based on our preliminary Fold 5 perfor-
mances, our system performances on the test data
(test_2015, collected in the same period as
dev_2015) are within expectation. In general,
the fine-grained evaluation is a more challeng-
ing task, as seen from the huge performance
difference between the F1 score of 10types and
notypes.
</bodyText>
<table confidence="0.999908333333333">
Type Precision Recall F1
COMPANY 80.00 41.03 54.24
FACILITY 52.17 31.58 39.34
GEO-LOC 63.81 57.76 60.63
MOVIE 100.00 33.33 50.00
MUSICARTIST 50.00 9.76 16.33
OTHER 50.00 30.30 37.74
PERSON 70.70 64.91 67.68
PRODUCT 20.00 8.11 11.54
SPORTSTEAM 79.41 38.57 51.92
TVSHOW 0.00 0.00 0.00
Overall 63.62 43.12 51.40
</table>
<tableCaption confidence="0.948943">
Table 5: Performance of each fine-grained type of
our system.
</tableCaption>
<bodyText confidence="0.97777705882353">
Table 5 shows the performance of each fine-
grained type of our system. Unlike traditional
NER where state-of-the-art systems can achieve
performances over 90 F1 for the 3 MUC types
(PERSON, LOCATION and ORGANIZATION),
Twitter NER poses new challenges in accurately
extracting entity information in such genre that
does not exist in the past.
We are interested to know the performance con-
tribution of the word clusters on the test data. Ta-
ble 4 shows the performances on the test data
when word cluster features are not used. Simi-
lar to the observations in the training
data, word clusters are important features for our
system: a performance drop greater than 16% and
12% is observed for the 10types and notypes eval-
uation respectively.
</bodyText>
<sectionHeader confidence="0.998904" genericHeader="conclusions">
5 Conclusion
</sectionHeader>
<bodyText confidence="0.99987">
In this paper, we describe our system used in the
W-NUT Shared Task for NER in Twitter. We fo-
cus our efforts on improving Twitter NER using
word representations, namely, Brown clusters and
K-means clusters, that are generated from large
amount of unlabeled newswire data and tweets.
Our experiments and evaluation results show that
cluster features derived from word representations
are effective in improving Twitter NER perfor-
mances. In future, we hope to investigate on the
use of distant supervision learning technique to
build better system that can perform more robustly
across tweets from different time periods. We also
like to perform an error analysis to help us under-
stand which other problems persist so as to address
them in future.
</bodyText>
<page confidence="0.998136">
144
</page>
<sectionHeader confidence="0.989782" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.999720907407407">
Colin Cherry and Hongyu Guo. 2015. The Unrea-
sonable Effectiveness of Word Representations for
Twitter Named Entity Recognition. In Proceed-
ings of the 2015 Conference of the North Ameri-
can Chapter of the Association for Computational
Linguistics: Human Language Technologies, pages
735–745, Denver, Colorado, May–June. Association
for Computational Linguistics.
Marco Lui and Timothy Baldwin. 2012. langid.py: An
Off-the-shelf Language Identification Tool. In Pro-
ceedings of the ACL 2012 System Demonstrations,
pages 25–30, Jeju Island, Korea, July. Association
for Computational Linguistics.
Tomas Mikolov, Wen-tau Yih, and Geoffrey Zweig.
2013. Linguistic Regularities in Continuous Space
Word Representations. In Proceedings of the 2013
Conference of the North American Chapter of the
Association for Computational Linguistics: Human
Language Technologies, pages 746–751, Atlanta,
Georgia, June. Association for Computational Lin-
guistics.
Jeffrey Pennington, Richard Socher, and Christopher
Manning. 2014. Glove: Global Vectors for Word
Representation. In Proceedings of the 2014 Con-
ference on Empirical Methods in Natural Language
Processing (EMNLP), pages 1532–1543, Doha,
Qatar, October. Association for Computational Lin-
guistics.
Lev Ratinov and Dan Roth. 2009. Design Chal-
lenges and Misconceptions in Named Entity Recog-
nition. In Proceedings of the Thirteenth Confer-
ence on Computational Natural Language Learning
(CoNLL-2009), pages 147–155, Boulder, Colorado,
June. Association for Computational Linguistics.
Alan Ritter, Sam Clark, Mausam, and Oren Etzioni.
2011. Named Entity Recognition in Tweets: An Ex-
perimental Study. In Proceedings of the 2011 Con-
ference on Empirical Methods in Natural Language
Processing, pages 1524–1534, Edinburgh, Scotland,
UK., July. Association for Computational Linguis-
tics.
Timothy Baldwin, Marie Catherine de Marneffe, Bo
Han, Young-Bum Kim, Alan Ritter, and Wei Xu. 2015.
Findings of the 2015 Workshop on Noisy User-
generated Text. In Proceedings of the Workshop on
Noisy User-generated Text (WNUT 2015). Associa-
tion for Computational Linguistics.
Joseph Turian, Lev-Arie Ratinov, and Yoshua Bengio.
2010. Word Representations: A Simple and General
Method for Semi-Supervised Learning. In Proceed-
ings of the 48th Annual Meeting of the Association
for Computational Linguistics, pages 384–394, Up-
psala, Sweden, July. Association for Computational
Linguistics.
</reference>
<page confidence="0.998808">
145
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.442263">
<title confidence="0.960839">Improving Twitter Named Entity Recognition using Representations</title>
<author confidence="0.886814">Zhiqiang Toh</author>
<author confidence="0.886814">Bin Chen</author>
<author confidence="0.886814">Jian</author>
<affiliation confidence="0.6927465">Institute for Infocomm 1 Fusionopolis</affiliation>
<address confidence="0.663343">Singapore</address>
<email confidence="0.92194">ztoh@i2r.a-star.edu.sg</email>
<email confidence="0.92194">bchen@i2r.a-star.edu.sg</email>
<email confidence="0.92194">sujian@i2r.a-star.edu.sg</email>
<abstract confidence="0.992019823529412">This paper describes our system used in the ACL 2015 Workshop on Noisy Usergenerated Text Shared Task for Named Entity Recognition (NER) in Twitter. Our system uses Conditional Random Fields to train two separate classifiers for the two evaluations: predicting 10 fine-grained types, and segmenting named entities. We focus our efforts on generating word representations from large amount of unlabeled newswire data and tweets. Our experiment results show that cluster features derived from word representations significantly improve Twitter NER performances. Our system is ranked 2nd for both evaluations.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Colin Cherry</author>
<author>Hongyu Guo</author>
</authors>
<title>The Unreasonable Effectiveness of Word Representations for Twitter Named Entity Recognition.</title>
<date>2015</date>
<booktitle>In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies,</booktitle>
<pages>735--745</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Denver, Colorado, May–June.</location>
<contexts>
<context position="4380" citStr="Cherry and Guo, 2015" startWordPosition="691" endWordPosition="694"> Further preprocessing is performed by following the cleaning step described in Turian et al. (2010). This results in a corpus consisting of 76 million sentences. The collected raw tweets are tokenized3 and non-English tweets are removed using langid.py (Lui and Baldwin, 2012), resulting in a total of 14 million tweets. 3 Features This section briefly describes the features used in our system. Besides the features commonly used in traditional NER systems, we focus on the use of word cluster features that have shown to be effective in previous work (Ratinov and Roth, 2009; Turian et al., 2010; Cherry and Guo, 2015). 3.1 Word Feature The current word and its lowercase format are used as features. To provide additional context information, the previous word and next word (in original format) are also used. 3.2 Orthographic Features Orthographic features based on regular expressions are often used in NER systems. We only use the following two orthographic features: InitialCap ([A-Z][a-z].*) and AllCaps ([A-Z]+). In addition, the first character and last two characters of each word are used as features. 3.3 Gazetteer Feature The current word is matched with entries in the Freebase entity lists and the featu</context>
</contexts>
<marker>Cherry, Guo, 2015</marker>
<rawString>Colin Cherry and Hongyu Guo. 2015. The Unreasonable Effectiveness of Word Representations for Twitter Named Entity Recognition. In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 735–745, Denver, Colorado, May–June. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Marco Lui</author>
<author>Timothy Baldwin</author>
</authors>
<title>langid.py: An Off-the-shelf Language Identification Tool.</title>
<date>2012</date>
<booktitle>In Proceedings of the ACL 2012 System Demonstrations,</booktitle>
<pages>25--30</pages>
<institution>Jeju Island, Korea, July. Association for Computational Linguistics.</institution>
<contexts>
<context position="4036" citStr="Lui and Baldwin, 2012" startWordPosition="631" endWordPosition="634">e ACL 2015 Workshop on Noisy User-generated Text, pages 141–145, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics 2014)1, (2) English Gigaword Fifth Edition2, and (3) raw tweets collected between the period of March 2015 and April 2015. For English Gigaword, all articles of story type are collected and tokenized. Further preprocessing is performed by following the cleaning step described in Turian et al. (2010). This results in a corpus consisting of 76 million sentences. The collected raw tweets are tokenized3 and non-English tweets are removed using langid.py (Lui and Baldwin, 2012), resulting in a total of 14 million tweets. 3 Features This section briefly describes the features used in our system. Besides the features commonly used in traditional NER systems, we focus on the use of word cluster features that have shown to be effective in previous work (Ratinov and Roth, 2009; Turian et al., 2010; Cherry and Guo, 2015). 3.1 Word Feature The current word and its lowercase format are used as features. To provide additional context information, the previous word and next word (in original format) are also used. 3.2 Orthographic Features Orthographic features based on regul</context>
</contexts>
<marker>Lui, Baldwin, 2012</marker>
<rawString>Marco Lui and Timothy Baldwin. 2012. langid.py: An Off-the-shelf Language Identification Tool. In Proceedings of the ACL 2012 System Demonstrations, pages 25–30, Jeju Island, Korea, July. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Tomas Mikolov</author>
<author>Wen-tau Yih</author>
<author>Geoffrey Zweig</author>
</authors>
<title>Linguistic Regularities in Continuous Space Word Representations.</title>
<date>2013</date>
<booktitle>In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies,</booktitle>
<pages>746--751</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Atlanta, Georgia,</location>
<contexts>
<context position="6078" citStr="Mikolov et al., 2013" startWordPosition="948" endWordPosition="951">an be found at https://github.com/myleott/ark-twokenize-py Brown clusters are generated using the implementation by Percy Liang4. We experiment with different cluster sizes (1100, 200, 500, 10001), resulting in different cluster files for each of the corpora. For each cluster file, different minimum occurrences (15,10, 201) and binary prefix lengths (14, 6, · · · ,14,16}) are tested. For each word in the tweet, its corresponding binary prefix string representation is used as the feature value. K-means clusters are generated using two different methods. The first method uses the word2vec tool (Mikolov et al., 2013)5. By varying the minimum occurrences (15,10, 201), word vector size (150,100, 200, 500, 10001), cluster size (150,100,200,500, 10001) and sub-sampling threshold (10.00001, 0.001}), different cluster files are generated and tested. Similar to the Brown cluster feature, the name of the cluster that each word belongs to is used as the feature value. The second method uses the GloVe tool to generate global vectors for word representation6. As the GloVe tool does not output any form of clusters, K-mean clusters are generated from the global vectors using the K-means implementation from Apache Spar</context>
</contexts>
<marker>Mikolov, Yih, Zweig, 2013</marker>
<rawString>Tomas Mikolov, Wen-tau Yih, and Geoffrey Zweig. 2013. Linguistic Regularities in Continuous Space Word Representations. In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 746–751, Atlanta, Georgia, June. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jeffrey Pennington</author>
<author>Richard Socher</author>
<author>Christopher Manning</author>
</authors>
<title>Glove: Global Vectors for Word Representation.</title>
<date>2014</date>
<booktitle>In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP),</booktitle>
<pages>1532--1543</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Doha, Qatar,</location>
<marker>Pennington, Socher, Manning, 2014</marker>
<rawString>Jeffrey Pennington, Richard Socher, and Christopher Manning. 2014. Glove: Global Vectors for Word Representation. In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1532–1543, Doha, Qatar, October. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Lev Ratinov</author>
<author>Dan Roth</author>
</authors>
<title>Design Challenges and Misconceptions in Named Entity Recognition.</title>
<date>2009</date>
<booktitle>In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL-2009),</booktitle>
<pages>147--155</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Boulder, Colorado,</location>
<contexts>
<context position="4336" citStr="Ratinov and Roth, 2009" startWordPosition="683" endWordPosition="686">es of story type are collected and tokenized. Further preprocessing is performed by following the cleaning step described in Turian et al. (2010). This results in a corpus consisting of 76 million sentences. The collected raw tweets are tokenized3 and non-English tweets are removed using langid.py (Lui and Baldwin, 2012), resulting in a total of 14 million tweets. 3 Features This section briefly describes the features used in our system. Besides the features commonly used in traditional NER systems, we focus on the use of word cluster features that have shown to be effective in previous work (Ratinov and Roth, 2009; Turian et al., 2010; Cherry and Guo, 2015). 3.1 Word Feature The current word and its lowercase format are used as features. To provide additional context information, the previous word and next word (in original format) are also used. 3.2 Orthographic Features Orthographic features based on regular expressions are often used in NER systems. We only use the following two orthographic features: InitialCap ([A-Z][a-z].*) and AllCaps ([A-Z]+). In addition, the first character and last two characters of each word are used as features. 3.3 Gazetteer Feature The current word is matched with entrie</context>
</contexts>
<marker>Ratinov, Roth, 2009</marker>
<rawString>Lev Ratinov and Dan Roth. 2009. Design Challenges and Misconceptions in Named Entity Recognition. In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL-2009), pages 147–155, Boulder, Colorado, June. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Sam Clark</author>
<author>Mausam</author>
<author>Oren Etzioni</author>
</authors>
<title>Named Entity Recognition in Tweets: An Experimental Study.</title>
<date>2011</date>
<booktitle>In Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>1524--1534</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Edinburgh, Scotland, UK.,</location>
<contexts>
<context position="2880" citStr="Ritter et al., 2011" startWordPosition="448" endWordPosition="451">as a sequential labeling task, using Conditional Random Fields (CRF) as the training algorithm. An additional postprocessing step is applied to further refine the system output. The remainder of this paper is structured as follows. In Section 2, we report on the external resources used by our system and how they are obtained and processed. In Section 3, the features used are described in details. In Section 4, the experiment and official results are presented. Finally, Section 5 summarizes our work. 2 External Resources External resources have shown to improve the performances of Twitter NER (Ritter et al., 2011). Our system uses a variety of external resources, either publicly available, or collected and preprocessed by us. 2.1 Freebase Entity Lists We use the Freebase entity lists provided by the task organizers. For some of the lists that are not provided (e.g. a list of sports facilities), we manually collect them by calling the appropriate Freebase API. 2.2 Unlabeled Corpora We gather unlabeled corpora from three different sources: (1) Pre-trained word vectors generated using the GloVe tool (Pennington et al., 141 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 141–145, B</context>
</contexts>
<marker>Ritter, Clark, Mausam, Etzioni, 2011</marker>
<rawString>Alan Ritter, Sam Clark, Mausam, and Oren Etzioni. 2011. Named Entity Recognition in Tweets: An Experimental Study. In Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing, pages 1524–1534, Edinburgh, Scotland, UK., July. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Tim Baldwin</author>
<author>Bo Han</author>
<author>Marie Catherine de Marneffe</author>
<author>Young-Bum Kim</author>
<author>Alan Ritter</author>
<author>Wei Xu</author>
</authors>
<title>Findings of the 2015 Workshop on Noisy User-generated Text.</title>
<date>2015</date>
<booktitle>In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015). Association for Computational Linguistics.</booktitle>
<marker>Baldwin, Han, de Marneffe, Kim, Ritter, Xu, 2015</marker>
<rawString>Marie Marie Catherine de Marneffe Young-Bum Kim Alan Ritter Wei Xu Tim Baldwin, Bo Han. 2015. Findings of the 2015 Workshop on Noisy Usergenerated Text. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015). Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Joseph Turian</author>
<author>Lev-Arie Ratinov</author>
<author>Yoshua Bengio</author>
</authors>
<title>Word Representations: A Simple and General Method for Semi-Supervised Learning.</title>
<date>2010</date>
<booktitle>In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics,</booktitle>
<pages>384--394</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Uppsala, Sweden,</location>
<contexts>
<context position="3859" citStr="Turian et al. (2010)" startWordPosition="604" endWordPosition="607">labeled Corpora We gather unlabeled corpora from three different sources: (1) Pre-trained word vectors generated using the GloVe tool (Pennington et al., 141 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 141–145, Beijing, China, July 31, 2015. c 2015 Association for Computational Linguistics 2014)1, (2) English Gigaword Fifth Edition2, and (3) raw tweets collected between the period of March 2015 and April 2015. For English Gigaword, all articles of story type are collected and tokenized. Further preprocessing is performed by following the cleaning step described in Turian et al. (2010). This results in a corpus consisting of 76 million sentences. The collected raw tweets are tokenized3 and non-English tweets are removed using langid.py (Lui and Baldwin, 2012), resulting in a total of 14 million tweets. 3 Features This section briefly describes the features used in our system. Besides the features commonly used in traditional NER systems, we focus on the use of word cluster features that have shown to be effective in previous work (Ratinov and Roth, 2009; Turian et al., 2010; Cherry and Guo, 2015). 3.1 Word Feature The current word and its lowercase format are used as featur</context>
</contexts>
<marker>Turian, Ratinov, Bengio, 2010</marker>
<rawString>Joseph Turian, Lev-Arie Ratinov, and Yoshua Bengio. 2010. Word Representations: A Simple and General Method for Semi-Supervised Learning. In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics, pages 384–394, Uppsala, Sweden, July. Association for Computational Linguistics.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>