<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.083494">
<title confidence="0.969314">
NRC: Infused Phrase Vectors for Named Entity Recognition in Twitter
</title>
<author confidence="0.985589">
Colin Cherry and Hongyu Guo and Chengbi Dai
</author>
<affiliation confidence="0.976123">
National Research Council Canada
</affiliation>
<email confidence="0.420983">
first.last@nrc-cnrc.gc.ca
</email>
<sectionHeader confidence="0.982251" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999527235294118">
Our submission to the W-NUT Named En-
tity Recognition in Twitter task closely
follows the approach detailed by Cherry
and Guo (2015), who use a discrimi-
native, semi-Markov tagger, augmented
with multiple word representations. We
enhance this approach with updated
gazetteers, and with infused phrase em-
beddings that have been adapted to better
predict the gazetteer membership of each
phrase. Our system achieves a typed F1 of
44.7, resulting in a third-place finish, de-
spite training only on the official training
set. A post-competition analysis indicates
that also training on the provided devel-
opment data improves our performance to
54.2 F1.
</bodyText>
<sectionHeader confidence="0.998989" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999917155555556">
Named entity recognition (NER) is the task of
finding rigid designators as they appear in free text
and assigning them to coarse types such as per-
son or geo-location (Nadeau and Sekine, 2007).
NER is the first step in many information ex-
traction tasks, but in social media, this task is
extremely challenging. The text to be analyzed
is unedited and noisy, and covers a much more
diverse set of topics than one might expect in
newswire. As such, we are quite interested in
the W-NUT Named Entity Recognition in Twitter
task (Baldwin et al., 2015) as a platform to bench-
mark and drive forward work on NER in social
media.
Our submission to this competition closely
follows Cherry and Guo (2015), who advocate
the use of a semi-Markov tagger trained on-
line with standard discriminative tagging features,
gazetteer matches, Brown clusters, and word em-
beddings. We augment this approach with up-
dated gazetteers, phrase embeddings, and infused
embeddings that have been adapted to better pre-
dict gazetteer membership. Our novel infusion
technique allows us to adapt existing vectors to
NER regardless of their source, by training a type-
level auto-encoder whose hidden layer must pre-
dict the corresponding phrase’s gazetteer member-
ships while also recovering the original vector.
Our submitted system achieved a typed F1 of
44.7, placing third in the competition, while train-
ing only on the provided training data. The
competition organizers provided two development
sets, one (dev) that is close to the training data,
with both train and dev being drawn from the year
2010, and another (dev 2015) that is close to the
test data, with both dev 2015 and test being drawn
from the winter of 2014–2015. We present a post-
competition system that achieves an F1 of 54.2
using the same features and hyper-parameters as
our submitted system, except that our tagger is
also trained on all provided development data. We
close with an analysis of dev 2015’s relation to
the test set, and argue that these results may over-
estimate the impact that a small, in-domain train-
ing set can have on NER performance.
</bodyText>
<sectionHeader confidence="0.988131" genericHeader="method">
2 Data Resources
</sectionHeader>
<bodyText confidence="0.999946866666667">
We make use of two external data resources:
gazetteers and unlabeled tweets. For gazetteers,
we begin with the word lists provided with the
W-NUT baseline system, which appear to be
mostly derived from Freebase. We treat each file
in the lexicon directory as a distinct word list. We
update and augment these lists with our own Free-
base queries in Section 3.2.
We use unannotated tweets to build various
word representations (see Section 3.1). Our unan-
notated corpus collects 98M tweets (1,995M to-
kens) from between May 2011 and April 2012.
The same corpus is used by Cherry and Guo
(2015). These tweets have been tokenized and
post-processed to remove many special Unicode
</bodyText>
<page confidence="0.986034">
54
</page>
<note confidence="0.787611">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 54–60,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.999990833333333">
characters; they closely resemble those that appear
in the provided training and development sets.
Furthermore, the corpus consists only of tweets in
which the NER system of Ritter et al. (2011) de-
tects at least one entity. The automatic NER tags
are used only to select tweets for inclusion in the
corpus, after which the annotations are discarded.
Filtering our tweets in this way has two immediate
effects: first, each tweet is very likely to contain an
entity mention. Second, the tweets are very long,
with an average of 20.4 tokens per tweet.
As the test data is drawn from the winter of
2014–2015, we attempted to augment our corpus
with more recent data: 13M unannotated English
tweets drawn from Twitter’s public stream, from
between April 24 and May 6, 2015. As we had
very little recent data, we made no attempt to bias
the corpus to be entity-rich. This corpus of re-
cent tweets has an average tweet length of only
13.8 tokens. Our attempts to use this data to build
word representations did not improve NER perfor-
mance on the 2015 development set, regardless of
whether we used the data on its own or in combi-
nation with our larger corpus.
</bodyText>
<sectionHeader confidence="0.996561" genericHeader="method">
3 Methods
</sectionHeader>
<subsectionHeader confidence="0.999942">
3.1 Base Tagger
</subsectionHeader>
<bodyText confidence="0.999984866666667">
We first summarize the approach of Cherry and
Guo (2015), which we build upon for our system.
Tagger: We tag each tweet independently us-
ing a semi-Markov tagger (Sarawagi and Cohen,
2004), which tags phrasal entities using a single
operation, as opposed to traditional word-based
entity tagging schemes. An example tag sequence,
drawn from the 2010 development data, is shown
in Figure 1. Semi-Markov tagging gives us the
freedom to design features at either the phrase or
the word level, while also simplifying our tag set.
Furthermore, with our semi-Markov tags, we find
we have no need for Markov features that track
previous tag assignments, as our entity labels co-
here naturally. This speeds up tagging dramat-
ically. Semi-Markov tagging also introduces a
hyper-parameter P, the maximum entity length in
tokens.
Training: Our tagger is trained online with
large-margin updates, following a structured
variant of the passive aggressive (PA) algo-
rithm (Crammer et al., 2006). We regularize the
model both with a fixed number of epochs E
through the data, and using PA’s regularization
term C, which is similar to that of an SVM. We
also have the capacity to deploy example-specific
C-parameters, allowing us to assign some exam-
ples more weight during training, which we use
only in post-competition analysis.
Lexical Features: Recall that our semi-Markov
model allows for both word and phrase-level fea-
tures. The vast majority of our features are word-
level, with the representation for a phrase being
the sum of the features of its words. Our word-
level features closely follow the set proposed by
Ratnaparkhi (1996), covering word identity, the
identities of surrounding words within a window
of 2 tokens, and prefixes and suffixes up to three
characters in length. Each word identity feature
has three variants, with the first reporting the orig-
inal word, the second reporting a lowercased ver-
sion, and the third reporting a summary of the
word’s shape (“Mrs.” becomes “Aa.”). All word-
level features also have a variant that appends the
word’s Begin/Inside/Last/Unique position within
its entity. Our phrase-level features report phrase
identity, with lowercased and word shape variants,
along with a bias feature that is always on. Phrase
identity features allow us to memorize tags for
common phrases explicitly. Following the stan-
dard discriminative tagging paradigm, all features
have the tag identity appended to them.
Representation Features: We also produce
word-level features corresponding to a number of
external representations: gazetteer membership,
Brown clusters (Brown et al., 1992) and word em-
beddings. For gazetteers, we first segment the
tweet into longest matching gazetteer phrases, re-
solving overlapping phrases with a greedy left-to-
right walk through the tweet. Each word then gen-
erates a set of features indicating which gazetteers
(if any) contain its phrase. For cluster representa-
tions, we train Brown clusters on our unannotated
corpus, using the implementation by Liang (2005)
to build 1,000 clusters over types that occur with
a minimum frequency of 10. Following Miller
et al. (2004), each word generates indicators for
bit prefixes of its binary cluster signature, for pre-
fixes of length 2, 4, 8 and 12. For word embed-
dings, we use an in-house Java re-implementation
of word2vec (Mikolov et al., 2013a) to build 300-
dimensional vector representations for all types
that occur at least 10 times in our unannotated cor-
pus. Each word then reports a real-valued feature
(as opposed to an indicator) for each of the 300
</bodyText>
<page confidence="0.983531">
55
</page>
<figure confidence="0.971356">
Team O Person O O O O Other O O
Ducks sign LW Beleskey to 2-year extension - San Jose Mercury News http://dlvr.it/5RcvP #ANADucks
</figure>
<figureCaption confidence="0.999882">
Figure 1: An example of semi-Markov tagging.
</figureCaption>
<bodyText confidence="0.999951833333333">
dimensions in its vector representation. A single
random vector is created to represent all out-of-
vocabulary words. Our vectors and clusters cover
2.5 million types. Note that we do not include
part-of-speech tags as features, as they were not
found to be useful by Cherry and Guo (2015).
</bodyText>
<subsectionHeader confidence="0.999615">
3.2 Updated Gazetteers
</subsectionHeader>
<bodyText confidence="0.999955076923077">
During development, we found that features in-
volving gazetteers were having a larger impact on
dev than on dev 2015. Therefore, we enhanced
the gazetteers provided with the W-NUT base-
line system with our own Freebase queries, is-
sued between May 6–7, 2015. These updates
are summarized in Table 1. The baseline lexi-
cons for which we could infer Freebase categories
were replaced with updated queries, indicated with
new=no. We also added a number of entirely new
queries (new=yes). Any baseline lexicon that is
not mentioned in Table 1 was left untouched, and
remains included in our updated gazetteers.
</bodyText>
<subsectionHeader confidence="0.999632">
3.3 Phrase Embeddings
</subsectionHeader>
<bodyText confidence="0.999961323529412">
The work of Passos et al. (2014) suggests that em-
beddings built over phrases may be more useful to
NER than those built over words. To test this for
our tagger, we use the phrase finding tool provided
with word2vec to segment our unannotated corpus
into phrases up to 4 tokens in length (Mikolov et
al., 2013b). Their software uses a simple statis-
tic similar to pointwise mutual information to as-
sess whether two tokens should be combined into
a phrase. Token-pairs passing a threshold are
segmented into phrases, creating a new corpus.
Phrases longer than 2 tokens can be generated by
running this process repeatedly, allowing, for ex-
ample, Toronto to merge with Maple Leafs on the
second pass. Once the phrasal corpus has been
created, we run word2vec as usual. There is no
reason the same procedure could not be applied to
Brown clustering; only time constraints prevented
us from doing so.
The resulting phrase embeddings are used in
place of word embeddings, mostly for efficiency
considerations. We assign each word in the tweet
to its longest embedded phrase that matches the
tweet, resolving conflicts with a greedy, left-
to-right matching process. Each word is then
assigned a vector corresponding to its matched
phrase, meaning that the same vector will be re-
peated for each token in its phrase. This has the
property of having words receive different repre-
sentations, depending on their context. Otherwise,
the embedding features are identical to those de-
scribed in Section 3.1. Phrase embeddings enable
more gazetteer matches for gazetteer-infused vec-
tors, which we discuss next.
</bodyText>
<subsectionHeader confidence="0.955378">
3.4 Gazetteer-Infused Phrase Vectors
</subsectionHeader>
<bodyText confidence="0.999981482758621">
We employ an auto-encoder to leverage knowl-
edge derived from domain-specific gazetteers to
make the distributed phrase representations more
relevant to our NER task. In recent years, two
sources of information have been found to be
valuable to boost the performance for NER: dis-
tributed representation learned from a large corpus
and domain-specific lexicons (Turian et al., 2010;
Cherry and Guo, 2015). Research has also shown
that merging these two forms of information can
result in further predictive improvement for an
NER system (Passos et al., 2014). A similar strat-
egy for enhancing word embeddings has also been
demonstrated for sentiment analysis (Tang et al.,
2014). Following this line of research, we aim to
tailor (post-process) the unsupervised phrase em-
beddings, created in Section 3.3, for our NER task,
using an auto-encoder.
The auto-encoder eliminates the need to have
access to the original training data and the vec-
tor training model, requiring only the trained dis-
tributed vectors. In this sense, it can be considered
computationally lighter than the above mentioned
information fusion methods.1 Our approach is in-
spired by Ngiam et al. (2011) and Glorot et al.
(2011), where auto-encoders are efficiently de-
ployed to generate improved features for domains
or modalities that are different from those of its
inputs. Here, we employ an auto-encoder to inject
</bodyText>
<footnote confidence="0.999803">
1In practice, a number of popular sets of pre-
trained embedding vectors, trained with very large cor-
pora, are made available online to the research com-
munity, but without the original training data; such as
some sets from http://nlp.stanford.edu/projects/glove/ and
https://code.google.com/p/word2vec/
</footnote>
<page confidence="0.982174">
56
</page>
<table confidence="0.782476821428571">
Gazetteer # new
architecture.museum 9k no
architecture.structure 111k yes
broadcast.tv channel 1k no
business.brand 9k no
business.consumer company 2k no
business.consumer product 439k no
business.employer 282k yes
business.sponsor 2k no
business.company 393k yes
location.location 1,336k no
location.citytown* 189k yes
location.country 1k no
location.state province region* 66k yes
film.film 262k yes
music.artist 613k yes
music.musical group 195k yes
people.family name 6k no
people.person 3,200k no
business.product line 439k no
sports.professional sports team 1k yes
sports.school sports team 2k yes
sports.sports facility 7k yes
sports.sports league 4k no
sports.sports team 33k no
tv.tv network 3k no
tv.tv program 74k no
tv.tv series episode 1,316k yes
</table>
<tableCaption confidence="0.797996">
Table 1: The Freebase gazetteers we either up-
</tableCaption>
<bodyText confidence="0.9979535">
dated (new=no) or added (new=yes) to the baseline
gazetteers. Freebase categories can be recovered
by replacing “.” with “/”. Gazetteers marked with
a * were extracted from /location/mailing address
using the indicated property name.
relevant entity type information derived from our
gazetteers into the pre-trained phrase representa-
tions.
Using learned phrase vectors as input, the auto-
encoder’s goal is to reconstruct both the provided
input vector V and its entity types, as derived from
the collected gazetteers. In our experiments, we
assume the entity membership vector has a 0-1 en-
coding. That is, if there are G lexicon types, then
it has length G, where a 1 indicates membership
in the corresponding gazetteer. We implement the
squared error as the reconstruction cost criterion
for the auto-encoder training:
</bodyText>
<equation confidence="0.912958">
[V; G] = f(W_d f(W_e V + b_e) + b_d)
</equation>
<bodyText confidence="0.999987324324324">
where f denotes the hyperbolic tanh function,
while be and bd are the biases for the encoder and
decoder, respectively. We initialize the parame-
ters, namely the decoder matrix Wd and encoder
matrix We, by randomly sampling each value from
a uniform distribution [-0.1, +0.1]. Our experi-
ments use a learning rate of 0.001 and a momentum of
0.9. We found the optimal size for the hidden en-
coding layer to be 200 nodes. The auto-encoder is
trained using Stochastic Gradient Descent (SGD)
with 100 iterations, which converges very well for
our data.
With the above experimental settings, our
Gazetteer-Infused phrase vectors are created in
two stages. During the first stage, we select the
69,329 phrases that are shared by both the phrase
vectors and the collected gazetteers. The resulting
set of phrases is a very small fraction of the to-
tal of 4.5M vector entries created in Section 3.3.
Consequently, the overwhelming number of neg-
ative examples causes a highly imbalanced class
distribution problem for the gazetteer membership
component of our auto-encoder (Guo and Viktor,
2004). To cope with this skewed class challenge,
we randomly down-sample the negative training
data to balance the negative and positive instances.
These rebalanced data are then fed into the auto-
encoder to optimize the parameter matrices. In
the second stage, the trained auto-encoder is used
to generate a new vector [V ; G] for every phrase
created in Section 3.3. These gazetteer-infused
phrase vectors include a decoded version of the
original embedding V , as well as explicit soft pre-
dictions of gazetteer membership in G. Note that
we apply this process for all 45 of our gazetteers,
and not just those that correspond directly to the
types tagged in this task.
</bodyText>
<sectionHeader confidence="0.999953" genericHeader="evaluation">
4 Results
</sectionHeader>
<bodyText confidence="0.9999875">
We have collected results for our submitted sys-
tem, along with some salient pre- and post- com-
petition variants in Table 2. We discuss these re-
sults in detail below.
</bodyText>
<subsectionHeader confidence="0.997475">
4.1 Competition Results
</subsectionHeader>
<bodyText confidence="0.999797333333333">
Our submitted system is shown as [A]+[U], and in-
cludes all of the features described in Section 3. It
achieves our highest results on both dev 2015 and
the average of dev and dev 2015, and its perfor-
mance on test was sufficient to place third in the
competition.
</bodyText>
<page confidence="0.998265">
57
</page>
<table confidence="0.999618615384615">
System | dev P | dev R | dev F | dev 2015 P | dev 2015 R | dev 2015 F | Avg F | test P | test R | test F | Rnk
Baseline 57.0 44.4 49.9 38.5 30.9 34.3 42.1 35.6 29.1 32.0 -
Our Baseline 64.6 38.5 48.2 47.7 27.2 34.7 41.5 44.4 23.3 30.6 8
C&amp;G 2015 68.6 50.3 58.0 56.4 41.9 48.1 53.1 49.8 36.5 42.1 5
Inc. Regularization 70.7 50.8 59.2 57.2 42.3 48.6 53.9 51.4 36.8 42.9 4
[P]hrase vectors 69.4 50.8 58.7 58.0 42.7 49.2 53.9 52.6 37.8 44.0 3
[A]dapted vectors 70.0 51.7 59.5 59.9 42.3 49.6 54.5 52.2 37.5 43.7 4
[U]pdated gazetteers 68.5 51.4 58.8 57.4 42.7 49.0 53.9 52.0 37.4 43.5 4
[P]+[U] 73.7 54.2 62.5 59.7 44.1 50.7 56.6 53.2 38.9 44.9 3
[A]+[U] (Submitted) 72.8 53.4 61.6 62.4 44.5 51.9 56.8 53.2 38.6 44.7 3
+ dev - - - 59.0 44.5 50.7 - 54.9 41.0 46.9 3
+ dev + dev 2015 - - - - - - - 62.5 47.8 54.2 2
</table>
<tableCaption confidence="0.915471333333333">
Table 2: Experimental results for variants of our system, reporting Precision, Recall and balanced F-
measure. The Avg F column lists the average F-measure across dev and dev 2015, which was our model
selection criterion. The Rnk column lists the retro-active rank of each system in the competition.
</tableCaption>
<subsectionHeader confidence="0.992568">
4.2 Ablation
</subsectionHeader>
<bodyText confidence="0.99997076744186">
The systems above [A]+[U] are intended to
demonstrate our development process. Our base-
line is our attempt to re-implement the provided
baseline in our code base, and includes all lexical
features and the baseline gazetteers.
C&amp;G 2015 adds Brown clusters and word em-
beddings to create a complete re-implementation
of Cherry and Guo (2015). We can see that these
representations have a huge impact on NER per-
formance for all dev and test sets.
We then performed a careful hyper-parameter
sweep using the two provided development sets,
resulting in the Inc. Regularization system. The
hyper-parameters suggested by Cherry and Guo
(2015) (E=10, C=0.01, P=10) were selected to
work well with and without representations. We
found that once we have committed to using rep-
resentations, the tagger benefits from increased
regularization, so long as we allow the model to
converge (E=30, C=0.001, P=8). Although we
revisited these settings periodically, these hyper-
parameters have proved to be quite stable, and we
use them for all remaining experiments.
The next three systems test the three extensions
described in Section 3. Neither [P]hrase vectors
nor [U]pdated gazetteers were able to improve
both dev and dev 2015 when applied alone, while
the [A]dapted vectors did boost performance on
both sets, increasing average F-measure by 0.6. In
particular, the adapted vectors improved the rare
entity types such as movie and sports team. Unfor-
tunately, these improvements do not seem to carry
over to the test set.
As we combine the ideas with [P]+[U] and
[A]+[U], we see even larger improvements on
both development sets. Note that adapted vec-
tors implicitly include phrase vectors, as those are
the vectors that have been adapted. These ideas
may work better in combination because both our
phrase vectors and our updated gazetteers include
many noisy phrasal entries, but their sources of
noise are independent, allowing one to compen-
sate for the other.
</bodyText>
<subsectionHeader confidence="0.99963">
4.3 Post-Competition Results
</subsectionHeader>
<bodyText confidence="0.99999365">
All systems discussed thus far have been trained
only on the official training data. The final two
systems in Table 2 test the impact of adding dev
(599 tweets from 2010) and dev 2015 (420 tweets
from 2014–2015) to the 1,795 training tweets.
When training with dev 2015, we take advan-
tage of our system’s data-weighting capabilities
to assign these examples twice as much weight;
this hyper-parameter was selected only based on
dev 2015’s relatively small size, and we did not
test any other values for it. As one can see, both
development sets have a significant impact on test
performance, with dev 2015 producing a 7.3 point
improvement in F-measure, eclipsing the impact
of all other enhancements to our system.
We chose not to include dev in our final system
because it did not appear to have a positive impact
on dev 2015. We chose not to include dev 2015
in our final system because cross validation exper-
iments, where we train including half of dev 2015
</bodyText>
<page confidence="0.995679">
58
</page>
<bodyText confidence="0.999800833333333">
and then test on the other half, indicated that its
impact would be minimal (less than 1 point of
F-measure), and we did not want to discard the
safety net that a held-out development set provides
when selecting a final system. In retrospect, this
was a fairly large mistake.
</bodyText>
<subsectionHeader confidence="0.999431">
4.4 Dev-Test Data Analysis
</subsectionHeader>
<bodyText confidence="0.998232761904762">
Does this mean that 420 examples drawn from a
time period close to that of the test set will consis-
tently provide 7 points of F-measure? We do not
believe so, for at least two reasons: time overlap
and Twitter bots.
Time overlap: dev 2015 and test appear to have
both been drawn from overlapping periods of time.
Though the tweets provided for development and
test were not dated, we can infer the date spans
from various bots that tweet the date, producing
tweets like:
@ABCD http://t.co/MA3WYTR72Q
February 02 , 2015 at 10:57 PM
throughout both sets. Using the regular expression
“(December|(Jan|Febr)uary) [0-9]+ , 201[4-5] at”
we were able to find tweets between December 11,
2014 and February 4, 2015 in dev 2015, and be-
tween December 12, 2014 and February 5, 2015
in test. This means that both sets contain the same
major holidays, such as Christmas, and sporting
events, such as Super Bowl 49. Accordingly, our
post-competition system sees its largest improve-
ments in the types sports team and other (which
covers many event names). These sorts of im-
provements are not necessarily indicative of how
a system trained on data from the past will per-
form on new data, which is a common use case for
NER. This also highlights the importance of hav-
ing an NER system’s training data be drawn from
throughout the year. The official training data has
no mention of the Super Bowl and very few of
Christmas, despite these being yearly events.
Twitter bots: bots are common in both dev 2015
and test, but much less prominent in train and dev,
perhaps reflecting an overall change in Twitter traf-
fic between 2010 and 2015. For the most part,
bots are harmless, and a system should be tested in
terms of its ability to ignore these sources of noise.
However, some bots tweet entities in an extremely
formulaic manner, and a discriminative NER sys-
tem needs to see very few tweets from such a bot
in order to tag it consistently. One such example
from this competition is the horoscope bot, which
produces tweets that look like:
Your boss may be critical of your easy-
going attitude at work t ... More for
Cancer http://t.co/74bwPzVbiB
This bot can be detected reliably with the regu-
lar expression “[.][.][.] More for [A-Z]”. In the
gold-standard annotations, tweets from this bot
consistently have the astrological sign (Cancer in
this case) tagged as other. While this bot ap-
pears only 4 times in dev 2015, it appears 23 times
in test. If we remove these 23 tweets from the
test set, our submitted system increases its per-
formance to a Precision / Recall / F-measure of
53.5 / 40.0 / 45.8, while our best post-competition
system decreases to 61.0 / 46.2 / 52.6, narrowing
the gap in F-measure by 2.6 points. The ability
to extract entities from formulaic bots such as this
one could be useful, but the core purpose of NER
technology is to enable the extraction of informa-
tion from human-written text.
</bodyText>
<sectionHeader confidence="0.998835" genericHeader="conclusions">
5 Conclusion
</sectionHeader>
<bodyText confidence="0.999995">
We have summarized our entry to the first W-NUT
Named Entity Recognition in Twitter task. Our
entry extends the work of Cherry and Guo
(2015) with updated lexicons, phrase embeddings,
and gazetteer-infused phrase embeddings. Our
gazetteer infusion technique is novel in that it
allows us to adapt existing vectors, regardless
of their source. Taken together with improved
hyper-parameters, these extensions improve the
approach of Cherry and Guo (2015) by 2.6 F-
measure on a completely blind test. Our final sub-
mission achieves a test F-measure of 44.7, placing
third in the competition, and could have achieved
an F-measure of 54.2 had we included all devel-
opment data as training data. We have also pre-
sented a discussion of how the most recent devel-
opment set relates to the test set, arguing that these
results likely over-estimate the impact of a small,
in-domain training set on NER performance.
</bodyText>
<sectionHeader confidence="0.999478" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.998975">
Timothy Baldwin, Marie Catherine de Marneffe,
Bo Han, Young-Bum Kim, Alan Ritter, and Wei
Xu. 2015. Shared tasks of the 2015 workshop on
noisy user-generated text: Twitter lexical normal-
ization and named entity recognition. In Proceed-
ings of the Workshop on Noisy User-generated Text
(WNUT 2015), Beijing, China.
</reference>
<page confidence="0.982905">
59
</page>
<reference confidence="0.99989">
Peter F Brown, Peter V Desouza, Robert L Mercer,
Vincent J Della Pietra, and Jenifer C Lai. 1992.
Class-based n-gram models of natural language.
Computational linguistics, 18(4):467–479.
Colin Cherry and Hongyu Guo. 2015. The unreason-
able effectiveness of word representations for Twit-
ter named entity recognition. In HLT-NAACL.
Koby Crammer, Ofer Dekel, Joseph Keshet, Shai
Shalev-Shwartz, and Yoram Singer. 2006. Online
passive-aggressive algorithms. Journal of Machine
Learning Research.
Xavier Glorot, Antoine Bordes, and Yoshua Bengio.
2011. Domain adaptation for large-scale sentiment
classification: A deep learning approach. In ICML,
pages 513–520.
Hongyu Guo and Herna L. Viktor. 2004. Learning
from imbalanced data sets with boosting and data
generation: the databoost-im approach. SIGKDD
Explorations, 6(1):30–39.
Percy Liang. 2005. Semi-supervised learning for nat-
ural language. Ph.D. thesis, Massachusetts Institute
of Technology.
Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey
Dean. 2013a. Efficient estimation of word represen-
tations in vector space. In ICLR Workshop.
Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S Cor-
rado, and Jeff Dean. 2013b. Distributed representa-
tions of words and phrases and their compositional-
ity. In Advances in Neural Information Processing
Systems, pages 3111–3119.
Scott Miller, Jethran Guinness, and Alex Zamanian.
2004. Name tagging with word clusters and discrim-
inative training. In HLT-NAACL, pages 337–342.
David Nadeau and Satoshi Sekine. 2007. A sur-
vey of named entity recognition and classification.
Lingvisticae Investigationes, 30(1):3–26.
Jiquan Ngiam, Aditya Khosla, Mingyu Kim, Juhan
Nam, Honglak Lee, and Andrew Y. Ng. 2011. Mul-
timodal deep learning. In ICML, pages 689–696.
Alexandre Passos, Vineet Kumar, and Andrew McCal-
lum. 2014. Lexicon infused phrase embeddings for
named entity resolution. In CoNLL, pages 78–86.
Adwait Ratnaparkhi. 1996. A maximum entropy
model for part-of-speech tagging. In EMNLP, pages
133–142.
Alan Ritter, Sam Clark, Mausam, and Oren Etzioni.
2011. Named entity recognition in tweets: An ex-
perimental study. In EMNLP, pages 1524–1534, Ed-
inburgh, Scotland, UK.
Sunita Sarawagi and William W Cohen. 2004. Semi-
markov conditional random fields for information
extraction. In NIPS, pages 1185–1192.
Duyu Tang, Furu Wei, Nan Yang, Ming Zhou, Ting
Liu, and Bing Qin. 2014. Learning sentiment-
specific word embedding for twitter sentiment clas-
sification. In ACL, pages 1555–1565.
Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010.
Word representations: a simple and general method
for semi-supervised learning. In ACL, pages 384–
394.
</reference>
<page confidence="0.998375">
60
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.589482">
<title confidence="0.999566">NRC: Infused Phrase Vectors for Named Entity Recognition in Twitter</title>
<author confidence="0.999711">Colin Cherry and Hongyu Guo and Chengbi Dai</author>
<affiliation confidence="0.996438">National Research Council Canada</affiliation>
<email confidence="0.991585">first.last@nrc-cnrc.gc.ca</email>
<abstract confidence="0.977158">Our submission to the W-NUT Named Entity Recognition in Twitter task closely follows the approach detailed by Cherry and Guo (2015), who use a discriminative, semi-Markov tagger, augmented with multiple word representations. We enhance this approach with updated gazetteers, and with infused phrase embeddings that have been adapted to better predict the gazetteer membership of each phrase. Our system achieves a typed F1 of 44.7, resulting in a third-place finish, despite training only on the official training set. A post-competition analysis indicates that also training on the provided development data improves our performance to 54.2 F1.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Timothy Baldwin</author>
<author>Marie Catherine de Marneffe</author>
<author>Bo Han</author>
<author>Young-Bum Kim</author>
<author>Alan Ritter</author>
<author>Wei Xu</author>
</authors>
<title>Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition.</title>
<date>2015</date>
<booktitle>In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015),</booktitle>
<location>Beijing, China.</location>
<marker>Baldwin, de Marneffe, Han, Kim, Ritter, Xu, 2015</marker>
<rawString>Timothy Baldwin, Marie Catherine de Marneffe, Bo Han, Young-Bum Kim, Alan Ritter, and Wei Xu. 2015. Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015), Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Peter F Brown</author>
<author>Peter V Desouza</author>
<author>Robert L Mercer</author>
<author>Vincent J Della Pietra</author>
<author>Jenifer C Lai</author>
</authors>
<title>Class-based n-gram models of natural language.</title>
<date>1992</date>
<journal>Computational linguistics,</journal>
<volume>18</volume>
<issue>4</issue>
<contexts>
<context position="7562" citStr="Brown et al., 1992" startWordPosition="1231" endWordPosition="1234">ll word-level features also have a variant that appends the word’s Begin/Inside/Last/Unique position within its entity. Our phrase-level features report phrase identity, with lowercased and word shape variants, along with a bias feature that is always on. Phrase identity features allow us to memorize tags for common phrases explicitly. Following the standard discriminative tagging paradigm, all features have the tag identity appended to them. Representation Features: We also produce word-level features corresponding to a number of external representations: gazetteer membership, Brown clusters (Brown et al., 1992) and word embeddings. For gazetteers, we first segment the tweet into longest matching gazetteer phrases, resolving overlapping phrases with a greedy left-to-right walk through the tweet. Each word then generates a set of features indicating which gazetteers (if any) contain its phrase. For cluster representations, we train Brown clusters on our unannotated corpus, using the implementation by Liang (2005) to build 1,000 clusters over types that occur with a minimum frequency of 10. Following Miller et al. (2004), each word generates indicators for bit prefixes of its binary cluster signature, f</context>
</contexts>
<marker>Brown, Desouza, Mercer, Pietra, Lai, 1992</marker>
<rawString>Peter F Brown, Peter V Desouza, Robert L Mercer, Vincent J Della Pietra, and Jenifer C Lai. 1992. Class-based n-gram models of natural language. Computational linguistics, 18(4):467–479.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Colin Cherry</author>
<author>Hongyu Guo</author>
</authors>
<title>The unreasonable effectiveness of word representations for Twitter named entity recognition.</title>
<date>2015</date>
<booktitle>In HLT-NAACL.</booktitle>
<contexts>
<context position="1534" citStr="Cherry and Guo (2015)" startWordPosition="245" endWordPosition="248">as they appear in free text and assigning them to coarse types such as person or geo-location (Nadeau and Sekine, 2007). NER is the first step in many information extraction tasks, but in social media, this task is extremely challenging. The text to be analyzed is unedited and noisy, and covers a much more diverse set of topics than one might expect in newswire. As such, we are quite interested in the W-NUT Named Entity Recognition in Twitter task (Baldwin et al., 2015) as a platform to benchmark and drive forward work on NER in social media. Our submission to this competition closely follows Cherry and Guo (2015), who advocate the use of a semi-Markov tagger trained online with standard discriminative tagging features, gazetteer matches, Brown clusters, and word embeddings. We augment this approach with updated gazetteers, phrase embeddings, and infused embeddings that have been adapted to better predict gazetteer membership. Our novel infusion technique allows us to adapt existing vectors to NER regardless of their source, by training a type-level auto-encoder whose hidden layer must predict the corresponding phrase’s gazetteer memberships while also recovering the original vector. Our submitted syste</context>
<context position="3537" citStr="Cherry and Guo (2015)" startWordPosition="579" endWordPosition="582">R performance. 2 Data Resources We make use of two external data resources: gazetteers and unlabeled tweets. For gazetteers, we begin with the word lists provided with the W-NUT baseline system, which appear to be mostly derived from Freebase. We treat each file in the lexicon directory as a distinct word list. We update and augment these lists with our own Freebase queries in Section 3.2. We use unannotated tweets to build various word representations (see Section 3.1). Our unannotated corpus collects 98M tweets (1,995M tokens) from between May 2011 and April 2012. The same corpus is used by Cherry and Guo (2015). These tweets have been tokenized and post-processed to remove many special Unicode 54 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 54–60, Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics characters; they closely resemble those that appear in the provided training and development sets. Furthermore, the corpus consists only of tweets in which the NER system of Ritter et al. (2011) detects at least one entity. The automatic NER tags are used only to select tweets for inclusion in the corpus, after which the annotations are discarded. Fi</context>
<context position="5019" citStr="Cherry and Guo (2015)" startWordPosition="832" endWordPosition="835">pted to augment our corpus with more recent data: 13M unannotated English tweets drawn from Twitter’s public stream, from between April 24 and May 6, 2015. As we had very little recent data, we made no attempt to bias the corpus to be entity-rich. This corpus of recent tweets has an average tweet length of only 13.8 tokens. Our attempts to use this data to build word representations did not improve NER performance on the 2015 development set, regardless of whether we used the data on its own or in combination with our larger corpus. 3 Methods 3.1 Base Tagger We first summarize the approach of Cherry and Guo (2015), which we build upon for our system. Tagger: We tag each tweet independently using a semi-Markov tagger (Sarawagi and Cohen, 2004), which tags phrasal entities using a single operation, as opposed to traditional word-based entity tagging schemes. An example tag sequence, drawn from the 2010 development data, is shown in Figure 1. Semi-Markov tagging gives us the freedom to design features at either the phrase or the word level, while also simplifying our tag set. Furthermore, with our semi-Markov tags, we find we have no need for Markov features that track previous tag assignments, as our ent</context>
<context position="8976" citStr="Cherry and Guo (2015)" startWordPosition="1469" endWordPosition="1472">types that occur at least 10 times in our unannotated corpus. Each word then reports a real-valued feature (as opposed to an indicator) for each of the 300 55 Team O Person O O O O Other O O Ducks sign LW Beleskey to 2-year extension - San Jose Mercury News http://dlvr.it/5RcvP #ANADucks Figure 1: An example of semi-Markov tagging. dimensions in its vector representation. A single random vector is created to represent all out-of-vocabulary words. Our vectors and clusters cover 2.5 million types. Note that we do not include part-of-speech tags as features, as they were not found to be useful by Cherry and Guo (2015). 3.2 Updated Gazetteers During development, we found that features involving gazetteers were having a larger impact on dev than on dev 2015. Therefore, we enhanced the gazetteers provided with the W-NUT baseline system with our own Freebase queries, issued between May 6–7, 2015. These updates are summarized in Table 1. The baseline lexicons for which we could infer Freebase categories were replaced with updated queries, indicated with new=no. We also added a number of entirely new queries (new=yes). Any baseline lexicon that is not mentioned in Table 1 was left untouched, and remains included</context>
<context position="11656" citStr="Cherry and Guo, 2015" startWordPosition="1903" endWordPosition="1906">erwise, the embedding features are identical to those described in Section 3.1. Phrase embeddings enable more gazetteer matches for gazetteer-infused vectors, which we discuss next. 3.4 Gazetteer-Infused Phrase Vectors We employ an auto-encoder to leverage knowledge derived from domain-specific gazetteers to make the distributed phrase representations more relevant to our NER task. In recent years, two sources of information have been found to be valuable to boost the performance for NER: distributed representation learned from a large corpus and domain-specific lexicons (Turian et al., 2010; Cherry and Guo, 2015). Research has also shown that merging these two forms of information can result in further predictive improvement for an NER system (Passos et al., 2014). A similar strategy for enhancing word embeddings has also been demonstrated for sentiment analysis (Tang et al., 2014). Following this line of research, we aim to tailor (post-process) the unsupervised phrase embeddings, created in Section 3.3, for our NER task, using an auto-encoder. The auto-encoder eliminates the need to have access to the original training data and the vector training model, requiring only the trained distributed vector</context>
<context position="18209" citStr="Cherry and Guo (2015)" startWordPosition="2987" endWordPosition="2990">s for variants of our system, reporting Precision, Recall and balanced F-measure. The Avg F column lists the average F-measure across dev and dev 2015, which was our model selection criterion. The Rnk column lists the retro-active rank of each system in the competition. 4.2 Ablation The systems above [A]+[U] are intended to demonstrate our development process. Our baseline is our attempt to re-implement the provided baseline in our code base, and includes all lexical features and the baseline gazetteers. C&amp;G 2015 adds Brown clusters and word embeddings to create a complete re-implementation of Cherry and Guo (2015). We can see that these representations have a huge impact on NER performance for all dev and test sets. We then performed a careful hyper-parameter sweep using the two provided development sets, resulting in the Inc. Regularization system. The hyper-parameters suggested by Cherry and Guo (2015) (E=10, C=0.01, P=10) were selected to work well with and without representations. We found that once we have committed to using representations, the tagger benefits from increased regularization, so long as we allow the model to converge (E=30, C=0.001, P=8). Although we revisited these settings period</context>
<context position="24191" citStr="Cherry and Guo (2015)" startWordPosition="4004" endWordPosition="4007">t. If we remove these 23 tweets from the test set, our submitted system increases its performance to a Precision / Recall / F-measure of 53.5 / 40.0 / 45.8, while our best post-competition system decreases to 61.0 / 46.2 / 52.6, narrowing the gap in F-measure by 2.6 points. The ability to extract entities from formulaic bots such as this one could be useful, but the core purpose of NER technology is to enable the extraction of information from human-written text. 5 Conclusion We have summarized our entry to the first W-NUT Named Entity Recognition in Twitter task. Our entry extends the work of Cherry and Guo (2015) with updated lexicons, phrase embeddings, and gazetteer-infused phrase embeddings. Our gazetteer infusion technique is novel in that it allows us to adapt existing vectors, regardless of their source. Taken together with improved hyper-parameters, these extensions improve the approach of Cherry and Guo (2015) by 2.6 F-measure on a completely blind test. Our final submission achieves a test F-measure of 44.7, placing third in the competition, and could have achieved an F-measure of 54.2 had we included all development data as training data. We have also presented a discussion of how the most re</context>
</contexts>
<marker>Cherry, Guo, 2015</marker>
<rawString>Colin Cherry and Hongyu Guo. 2015. The unreasonable effectiveness of word representations for Twitter named entity recognition. In HLT-NAACL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Koby Crammer</author>
<author>Ofer Dekel</author>
<author>Joseph Keshet</author>
<author>Shai Shalev-Shwartz</author>
<author>Yoram Singer</author>
</authors>
<title>Online passive-aggressive algorithms.</title>
<date>2006</date>
<journal>Journal of Machine Learning Research.</journal>
<contexts>
<context position="5939" citStr="Crammer et al., 2006" startWordPosition="977" endWordPosition="980">pment data, is shown in Figure 1. Semi-Markov tagging gives us the freedom to design features at either the phrase or the word level, while also simplifying our tag set. Furthermore, with our semi-Markov tags, we find we have no need for Markov features that track previous tag assignments, as our entity labels cohere naturally. This speeds up tagging dramatically. Semi-Markov tagging also introduces a hyper-parameter P, the maximum entity length in tokens. Training: Our tagger is trained online with large-margin updates, following a structured variant of the passive aggressive (PA) algorithm (Crammer et al., 2006). We regularize the model both with a fixed number of epochs E through the data, and using PA’s regularization term C, which is similar to that of an SVM. We also have the capacity to deploy example-specific C-parameters, allowing us to assign some examples more weight during training, which we use only in post-competition analysis. Lexical Features: Recall that our semi-Markov model allows for both word and phrase-level features. The vast majority of our features are word-level, with the representation for a phrase being the sum of the features of its words. Our word-level features closely foll</context>
</contexts>
<marker>Crammer, Dekel, Keshet, Shalev-Shwartz, Singer, 2006</marker>
<rawString>Koby Crammer, Ofer Dekel, Joseph Keshet, Shai Shalev-Shwartz, and Yoram Singer. 2006. Online passive-aggressive algorithms. Journal of Machine Learning Research.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Xavier Glorot</author>
<author>Antoine Bordes</author>
<author>Yoshua Bengio</author>
</authors>
<title>Domain adaptation for large-scale sentiment classification: A deep learning approach.</title>
<date>2011</date>
<booktitle>In ICML,</booktitle>
<pages>513--520</pages>
<contexts>
<context position="12445" citStr="Glorot et al. (2011)" startWordPosition="2030" endWordPosition="2033">ategy for enhancing word embeddings has also been demonstrated for sentiment analysis (Tang et al., 2014). Following this line of research, we aim to tailor (post-process) the unsupervised phrase embeddings, created in Section 3.3, for our NER task, using an auto-encoder. The auto-encoder eliminates the need to have access to the original training data and the vector training model, requiring only the trained distributed vectors. In this sense, it can be considered computationally lighter than the above mentioned information fusion methods.1 Our approach is inspired by Ngiam et al. (2011) and Glorot et al. (2011), where auto-encoders are efficiently deployed to generate improved features for domains or modalities that are different from those of its inputs. Here, we employ an auto-encoder to inject 1In practice, a number of popular sets of pretrained embedding vectors, trained with very large corpora, are made available online to the research community, but without the original training data; such as some sets from http://nlp.stanford.edu/projects/glove/ and https://code.google.com/p/word2vec/ 56 Gazetteer # new architecture.museum 9k no architecture.structure 111k yes broadcast.tv channel 1k no busin</context>
</contexts>
<marker>Glorot, Bordes, Bengio, 2011</marker>
<rawString>Xavier Glorot, Antoine Bordes, and Yoshua Bengio. 2011. Domain adaptation for large-scale sentiment classification: A deep learning approach. In ICML, pages 513–520.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Hongyu Guo</author>
<author>Herna L Viktor</author>
</authors>
<title>Learning from imbalanced data sets with boosting and data generation: the databoost-im approach.</title>
<date>2004</date>
<journal>SIGKDD Explorations,</journal>
<volume>6</volume>
<issue>1</issue>
<contexts>
<context position="15679" citStr="Guo and Viktor, 2004" startWordPosition="2525" endWordPosition="2528">dient Descent (SGD) with 100 iterations, which converges very well for our data. With the above experimental settings, our Gazetteer-Infused phrase vectors are created with two stages. During the initial phase, we select the 69,329 phrases that are shared by both the phrase vectors and the collected gazetteers. The resulting set of phrases is a very small fraction of the total of 4.5M vector entries created in Section 3.3. Consequently, the overwhelming number of negative examples causes a highly imbalanced class distribution problem for the gazetteer membership component of our auto-encoder (Guo and Viktor, 2004). To cope with this skewed class challenge, we randomly down-sample the negative training data to balance the negative and positive instances. These rebalanced data are then fed into the auto-encoder to optimize the parameter matrices. In the second stage, the trained auto-encoder is used to generate a new vector [V; G] for every phrase created in Section 3.3. These gazetteer-infused phrase vectors include a decoded version of the original embedding V, as well as explicit soft predictions of gazetteer membership in G. Note that we apply this process for all 45 of our gazetteers, and not just </context>
</contexts>
<marker>Guo, Viktor, 2004</marker>
<rawString>Hongyu Guo and Herna L. Viktor. 2004. Learning from imbalanced data sets with boosting and data generation: the databoost-im approach. SIGKDD Explorations, 6(1):30–39.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Percy Liang</author>
</authors>
<title>Semi-supervised learning for natural language.</title>
<date>2005</date>
<tech>Ph.D. thesis,</tech>
<institution>Massachusetts Institute of Technology.</institution>
<contexts>
<context position="7969" citStr="Liang (2005)" startWordPosition="1297" endWordPosition="1298"> tag identity appended to them. Representation Features: We also produce word-level features corresponding to a number of external representations: gazetteer membership, Brown clusters (Brown et al., 1992) and word embeddings. For gazetteers, we first segment the tweet into longest matching gazetteer phrases, resolving overlapping phrases with a greedy left-to-right walk through the tweet. Each word then generates a set of features indicating which gazetteers (if any) contain its phrase. For cluster representations, we train Brown clusters on our unannotated corpus, using the implementation by Liang (2005) to build 1,000 clusters over types that occur with a minimum frequency of 10. Following Miller et al. (2004), each word generates indicators for bit prefixes of its binary cluster signature, for prefixes of length 2, 4, 8 and 12. For word embeddings, we use an in-house Java re-implementation of word2vec (Mikolov et al., 2013a) to build 300-dimensional vector representations for all types that occur at least 10 times in our unannotated corpus. Each word then reports a real-valued feature (as opposed to an indicator) for each of the 300 55 Team O Person O O O O Other O O Ducks sign LW Beleskey </context>
</contexts>
<marker>Liang, 2005</marker>
<rawString>Percy Liang. 2005. Semi-supervised learning for natural language. Ph.D. thesis, Massachusetts Institute of Technology.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Tomas Mikolov</author>
<author>Kai Chen</author>
<author>Greg Corrado</author>
<author>Jeffrey Dean</author>
</authors>
<title>Efficient estimation of word representations in vector space.</title>
<date>2013</date>
<booktitle>In ICLR Workshop.</booktitle>
<contexts>
<context position="8295" citStr="Mikolov et al., 2013" startWordPosition="1352" endWordPosition="1355">ng overlapping phrases with a greedy left-to-right walk through the tweet. Each word then generates a set of features indicating which gazetteers (if any) contain its phrase. For cluster representations, we train Brown clusters on our unannotated corpus, using the implementation by Liang (2005) to build 1,000 clusters over types that occur with a minimum frequency of 10. Following Miller et al. (2004), each word generates indicators for bit prefixes of its binary cluster signature, for prefixes of length 2, 4, 8 and 12. For word embeddings, we use an in-house Java re-implementation of word2vec (Mikolov et al., 2013a) to build 300-dimensional vector representations for all types that occur at least 10 times in our unannotated corpus. Each word then reports a real-valued feature (as opposed to an indicator) for each of the 300 55 Team O Person O O O O Other O O Ducks sign LW Beleskey to 2-year extension - San Jose Mercury News http://dlvr.it/5RcvP #ANADucks Figure 1: An example of semi-Markov tagging. dimensions in its vector representation. A single random vector is created to represent all out-of-vocabulary words. Our vectors and clusters cover 2.5 million types. Note that we do not include part-of-spee</context>
<context position="9934" citStr="Mikolov et al., 2013" startWordPosition="1631" endWordPosition="1634">icons for which we could infer Freebase categories were replaced with updated queries, indicated with new=no. We also added a number of entirely new queries (new=yes). Any baseline lexicon that is not mentioned in Table 1 was left untouched, and remains included in our updated gazetteers. 3.3 Phrase Embeddings The work of Passos et al. (2014) suggests that embeddings built over phrases may be more useful to NER than those built over words. To test this for our tagger, we use the phrase finding tool provided with word2vec to segment our unannotated corpus into phrases up to 4 tokens in length (Mikolov et al., 2013b). Their software uses a simple statistic similar to pointwise mutual information to assess whether two tokens should be combined into a phrase. Token-pairs passing a threshold are segmented into phrases, creating a new corpus. Phrases longer than 2 tokens can be generated by running this process repeatedly, allowing, for example, Toronto to merge with Maple Leafs on the second pass. Once the phrasal corpus has been created, we run word2vec as usual. There is no reason the same procedure could not be applied to Brown clustering, only time constraints prevented us from doing so. The resulting </context>
</contexts>
<marker>Mikolov, Chen, Corrado, Dean, 2013</marker>
<rawString>Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013a. Efficient estimation of word representations in vector space. In ICLR Workshop.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Tomas Mikolov</author>
<author>Ilya Sutskever</author>
<author>Kai Chen</author>
<author>Greg S Corrado</author>
<author>Jeff Dean</author>
</authors>
<title>Distributed representations of words and phrases and their compositionality.</title>
<date>2013</date>
<booktitle>In Advances in Neural Information Processing Systems,</booktitle>
<pages>3111--3119</pages>
<contexts>
<context position="8295" citStr="Mikolov et al., 2013" startWordPosition="1352" endWordPosition="1355">ng overlapping phrases with a greedy left-to-right walk through the tweet. Each word then generates a set of features indicating which gazetteers (if any) contain its phrase. For cluster representations, we train Brown clusters on our unannotated corpus, using the implementation by Liang (2005) to build 1,000 clusters over types that occur with a minimum frequency of 10. Following Miller et al. (2004), each word generates indicators for bit prefixes of its binary cluster signature, for prefixes of length 2, 4, 8 and 12. For word embeddings, we use an in-house Java re-implementation of word2vec (Mikolov et al., 2013a) to build 300-dimensional vector representations for all types that occur at least 10 times in our unannotated corpus. Each word then reports a real-valued feature (as opposed to an indicator) for each of the 300 55 Team O Person O O O O Other O O Ducks sign LW Beleskey to 2-year extension - San Jose Mercury News http://dlvr.it/5RcvP #ANADucks Figure 1: An example of semi-Markov tagging. dimensions in its vector representation. A single random vector is created to represent all out-of-vocabulary words. Our vectors and clusters cover 2.5 million types. Note that we do not include part-of-spee</context>
<context position="9934" citStr="Mikolov et al., 2013" startWordPosition="1631" endWordPosition="1634">icons for which we could infer Freebase categories were replaced with updated queries, indicated with new=no. We also added a number of entirely new queries (new=yes). Any baseline lexicon that is not mentioned in Table 1 was left untouched, and remains included in our updated gazetteers. 3.3 Phrase Embeddings The work of Passos et al. (2014) suggests that embeddings built over phrases may be more useful to NER than those built over words. To test this for our tagger, we use the phrase finding tool provided with word2vec to segment our unannotated corpus into phrases up to 4 tokens in length (Mikolov et al., 2013b). Their software uses a simple statistic similar to pointwise mutual information to assess whether two tokens should be combined into a phrase. Token-pairs passing a threshold are segmented into phrases, creating a new corpus. Phrases longer than 2 tokens can be generated by running this process repeatedly, allowing, for example, Toronto to merge with Maple Leafs on the second pass. Once the phrasal corpus has been created, we run word2vec as usual. There is no reason the same procedure could not be applied to Brown clustering, only time constraints prevented us from doing so. The resulting </context>
</contexts>
<marker>Mikolov, Sutskever, Chen, Corrado, Dean, 2013</marker>
<rawString>Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S Corrado, and Jeff Dean. 2013b. Distributed representations of words and phrases and their compositionality. In Advances in Neural Information Processing Systems, pages 3111–3119.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Scott Miller</author>
<author>Jethran Guinness</author>
<author>Alex Zamanian</author>
</authors>
<title>Name tagging with word clusters and discriminative training.</title>
<date>2004</date>
<booktitle>In HLT-NAACL,</booktitle>
<pages>337--342</pages>
<contexts>
<context position="8078" citStr="Miller et al. (2004)" startWordPosition="1314" endWordPosition="1317">nding to a number of external representations: gazetteer membership, Brown clusters (Brown et al., 1992) and word embeddings. For gazetteers, we first segment the tweet into longest matching gazetteer phrases, resolving overlapping phrases with a greedy left-to-right walk through the tweet. Each word then generates a set of features indicating which gazetteers (if any) contain its phrase. For cluster representations, we train Brown clusters on our unannotated corpus, using the implementation by Liang (2005) to build 1,000 clusters over types that occur with a minimum frequency of 10. Following Miller et al. (2004), each word generates indicators for bit prefixes of its binary cluster signature, for prefixes of length 2, 4, 8 and 12. For word embeddings, we use an in-house Java re-implementation of word2vec (Mikolov et al., 2013a) to build 300-dimensional vector representations for all types that occur at least 10 times in our unannotated corpus. Each word then reports a real-valued feature (as opposed to an indicator) for each of the 300 55 Team O Person O O O O Other O O Ducks sign LW Beleskey to 2-year extension - San Jose Mercury News http://dlvr.it/5RcvP #ANADucks Figure 1: An example of semi-Marko</context>
</contexts>
<marker>Miller, Guinness, Zamanian, 2004</marker>
<rawString>Scott Miller, Jethran Guinness, and Alex Zamanian. 2004. Name tagging with word clusters and discriminative training. In HLT-NAACL, pages 337–342.</rawString>
</citation>
<citation valid="true">
<authors>
<author>David Nadeau</author>
<author>Satoshi Sekine</author>
</authors>
<title>A survey of named entity recognition and classification.</title>
<date>2007</date>
<journal>Lingvisticae Investigationes,</journal>
<volume>30</volume>
<issue>1</issue>
<pages>3--26</pages>
<contexts>
<context position="1032" citStr="Nadeau and Sekine, 2007" startWordPosition="156" endWordPosition="159">enhance this approach with updated gazetteers, and with infused phrase embeddings that have been adapted to better predict the gazetteer membership of each phrase. Our system achieves a typed F1 of 44.7, resulting in a third-place finish, despite training only on the official training set. A post-competition analysis indicates that also training on the provided development data improves our performance to 54.2 F1. 1 Introduction Named entity recognition (NER) is the task of finding rigid designators as they appear in free text and assigning them to coarse types such as person or geo-location (Nadeau and Sekine, 2007). NER is the first step in many information extraction tasks, but in social media, this task is extremely challenging. The text to be analyzed is unedited and noisy, and covers a much more diverse set of topics than one might expect in newswire. As such, we are quite interested in the W-NUT Named Entity Recognition in Twitter task (Baldwin et al., 2015) as a platform to benchmark and drive forward work on NER in social media. Our submission to this competition closely follows Cherry and Guo (2015), who advocate the use of a semi-Markov tagger trained online with standard discriminative tagging</context>
</contexts>
<marker>Nadeau, Sekine, 2007</marker>
<rawString>David Nadeau and Satoshi Sekine. 2007. A survey of named entity recognition and classification. Lingvisticae Investigationes, 30(1):3–26.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jiquan Ngiam</author>
<author>Aditya Khosla</author>
<author>Mingyu Kim</author>
<author>Juhan Nam</author>
<author>Honglak Lee</author>
<author>Andrew Y Ng</author>
</authors>
<title>Multimodal deep learning.</title>
<date>2011</date>
<booktitle>In ICML,</booktitle>
<pages>689--696</pages>
<contexts>
<context position="12420" citStr="Ngiam et al. (2011)" startWordPosition="2025" endWordPosition="2028">l., 2014). A similar strategy for enhancing word embeddings has also been demonstrated for sentiment analysis (Tang et al., 2014). Following this line of research, we aim to tailor (post-process) the unsupervised phrase embeddings, created in Section 3.3, for our NER task, using an auto-encoder. The auto-encoder eliminates the need to have access to the original training data and the vector training model, requiring only the trained distributed vectors. In this sense, it can be considered computationally lighter than the above mentioned information fusion methods.1 Our approach is inspired by Ngiam et al. (2011) and Glorot et al. (2011), where auto-encoders are efficiently deployed to generate improved features for domains or modalities that are different from those of its inputs. Here, we employ an auto-encoder to inject 1In practice, a number of popular sets of pretrained embedding vectors, trained with very large corpora, are made available online to the research community, but without the original training data; such as some sets from http://nlp.stanford.edu/projects/glove/ and https://code.google.com/p/word2vec/ 56 Gazetteer # new architecture.museum 9k no architecture.structure 111k yes broadca</context>
</contexts>
<marker>Ngiam, Khosla, Kim, Nam, Lee, Ng, 2011</marker>
<rawString>Jiquan Ngiam, Aditya Khosla, Mingyu Kim, Juhan Nam, Honglak Lee, and Andrew Y. Ng. 2011. Multimodal deep learning. In ICML, pages 689–696.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alexandre Passos</author>
<author>Vineet Kumar</author>
<author>Andrew McCallum</author>
</authors>
<title>Lexicon infused phrase embeddings for named entity resolution.</title>
<date>2014</date>
<booktitle>In CoNLL,</booktitle>
<pages>78--86</pages>
<contexts>
<context position="9658" citStr="Passos et al. (2014)" startWordPosition="1581" endWordPosition="1584">tures involving gazetteers were having a larger impact on dev than on dev 2015. Therefore, we enhanced the gazetteers provided with the W-NUT baseline system with our own Freebase queries, issued between May 6–7, 2015. These updates are summarized in Table 1. The baseline lexicons for which we could infer Freebase categories were replaced with updated queries, indicated with new=no. We also added a number of entirely new queries (new=yes). Any baseline lexicon that is not mentioned in Table 1 was left untouched, and remains included in our updated gazetteers. 3.3 Phrase Embeddings The work of Passos et al. (2014) suggests that embeddings built over phrases may be more useful to NER than those built over words. To test this for our tagger, we use the phrase finding tool provided with word2vec to segment our unannotated corpus into phrases up to 4 tokens in length (Mikolov et al., 2013b). Their software uses a simple statistic similar to pointwise mutual information to assess whether two tokens should be combined into a phrase. Token-pairs passing a threshold are segmented into phrases, creating a new corpus. Phrases longer than 2 tokens can be generated by running this process repeatedly, allowing, for</context>
<context position="11810" citStr="Passos et al., 2014" startWordPosition="1928" endWordPosition="1931">rs, which we discuss next. 3.4 Gazetteer-Infused Phrase Vectors We employ an auto-encoder to leverage knowledge derived from domain-specific gazetteers to make the distributed phrase representations more relevant to our NER task. In recent years, two sources of information have been found to be valuable to boost the performance for NER: distributed representation learned from a large corpus and domain-specific lexicons (Turian et al., 2010; Cherry and Guo, 2015). Research has also shown that merging these two forms of information can result in further predictive improvement for an NER system (Passos et al., 2014). A similar strategy for enhancing word embeddings has also been demonstrated for sentiment analysis (Tang et al., 2014). Following this line of research, we aim to tailor (post-process) the unsupervised phrase embeddings, created in Section 3.3, for our NER task, using an auto-encoder. The auto-encoder eliminates the need to have access to the original training data and the vector training model, requiring only the trained distributed vectors. In this sense, it can be considered computationally lighter than the above mentioned information fusion methods.1 Our approach is inspired by Ngiam et </context>
</contexts>
<marker>Passos, Kumar, McCallum, 2014</marker>
<rawString>Alexandre Passos, Vineet Kumar, and Andrew McCallum. 2014. Lexicon infused phrase embeddings for named entity resolution. In CoNLL, pages 78–86.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Adwait Ratnaparkhi</author>
</authors>
<title>A maximum entropy model for part-of-speech tagging.</title>
<date>1996</date>
<booktitle>In EMNLP,</booktitle>
<pages>133--142</pages>
<contexts>
<context position="6580" citStr="Ratnaparkhi (1996)" startWordPosition="1086" endWordPosition="1087">el both with a fixed number of epochs E through the data, and using PA’s regularization term C, which is similar to that of an SVM. We also have the capacity to deploy example-specific C-parameters, allowing us to assign some examples more weight during training, which we use only in post-competition analysis. Lexical Features: Recall that our semi-Markov model allows for both word and phrase-level features. The vast majority of our features are wordlevel, with the representation for a phrase being the sum of the features of its words. Our wordlevel features closely follow the set proposed by Ratnaparkhi (1996), covering word identity, the identities of surrounding words within a window of 2 tokens, and prefixes and suffixes up to three characters in length. Each word identity feature has three variants, with the first reporting the original word, the second reporting a lowercased version, and the third reporting a summary of the word’s shape (“Mrs.” becomes “Aa.”) All wordlevel features also have a variant that appends the word’s Begin/Inside/Last/Unique position within its entity. Our phrase-level features report phrase identity, with lowercased and word shape variants, along with a bias feature t</context>
</contexts>
<marker>Ratnaparkhi, 1996</marker>
<rawString>Adwait Ratnaparkhi. 1996. A maximum entropy model for part-of-speech tagging. In EMNLP, pages 133–142.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Sam Clark</author>
<author>Mausam</author>
<author>Oren Etzioni</author>
</authors>
<title>Named entity recognition in tweets: An experimental study.</title>
<date>2011</date>
<booktitle>In EMNLP,</booktitle>
<pages>1524--1534</pages>
<location>Edinburgh, Scotland, UK.</location>
<contexts>
<context position="3979" citStr="Ritter et al. (2011)" startWordPosition="645" endWordPosition="648">presentations (see Section 3.1). Our unannotated corpus collects 98M tweets (1,995M tokens) from between May 2011 and April 2012. The same corpus is used by Cherry and Guo (2015). These tweets have been tokenized and post-processed to remove many special Unicode 54 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 54–60, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics characters; they closely resemble those that appear in the provided training and development sets. Furthermore, the corpus consists only of tweets in which the NER system of Ritter et al. (2011) detects at least one entity. The automatic NER tags are used only to select tweets for inclusion in the corpus, after which the annotations are discarded. Filtering our tweets in this way has two immediate effects: first, each tweet is very likely to contain an entity mention. Second, the tweets are very long, with an average of 20.4 tokens per Tweet. As the test data is drawn from the winter of 2014–2015, we attempted to augment our corpus with more recent data: 13M unannotated English tweets drawn from Twitter’s public stream, from between April 24 and May 6, 2015. As we had very little rec</context>
</contexts>
<marker>Ritter, Clark, Mausam, Etzioni, 2011</marker>
<rawString>Alan Ritter, Sam Clark, Mausam, and Oren Etzioni. 2011. Named entity recognition in tweets: An experimental study. In EMNLP, pages 1524–1534, Edinburgh, Scotland, UK.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Sunita Sarawagi</author>
<author>William W Cohen</author>
</authors>
<title>Semi-Markov conditional random fields for information extraction.</title>
<date>2004</date>
<booktitle>In NIPS,</booktitle>
<pages>1185--1192</pages>
<contexts>
<context position="5150" citStr="Sarawagi and Cohen, 2004" startWordPosition="854" endWordPosition="857">en April 24 and May 6, 2015. As we had very little recent data, we made no attempt to bias the corpus to be entity-rich. This corpus of recent tweets has an average tweet length of only 13.8 tokens. Our attempts to use this data to build word representations did not improve NER performance on the 2015 development set, regardless of whether we used the data on its own or in combination with our larger corpus. 3 Methods 3.1 Base Tagger We first summarize the approach of Cherry and Guo (2015), which we build upon for our system. Tagger: We tag each tweet independently using a semi-Markov tagger (Sarawagi and Cohen, 2004), which tags phrasal entities using a single operation, as opposed to traditional word-based entity tagging schemes. An example tag sequence, drawn from the 2010 development data, is shown in Figure 1. Semi-Markov tagging gives us the freedom to design features at either the phrase or the word level, while also simplifying our tag set. Furthermore, with our semi-Markov tags, we find we have no need for Markov features that track previous tag assignments, as our entity labels cohere naturally. This speeds up tagging dramatically. Semi-Markov tagging also introduces a hyper-parameter P, the maxi</context>
</contexts>
<marker>Sarawagi, Cohen, 2004</marker>
<rawString>Sunita Sarawagi and William W Cohen. 2004. Semi-Markov conditional random fields for information extraction. In NIPS, pages 1185–1192.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Duyu Tang</author>
<author>Furu Wei</author>
<author>Nan Yang</author>
<author>Ming Zhou</author>
<author>Ting Liu</author>
<author>Bing Qin</author>
</authors>
<title>Learning sentiment-specific word embedding for twitter sentiment classification.</title>
<date>2014</date>
<booktitle>In ACL,</booktitle>
<pages>1555--1565</pages>
<contexts>
<context position="11930" citStr="Tang et al., 2014" startWordPosition="1947" endWordPosition="1950">om domain-specific gazetteers to make the distributed phrase representations more relevant to our NER task. In recent years, two sources of information have been found to be valuable to boost the performance for NER: distributed representation learned from a large corpus and domain-specific lexicons (Turian et al., 2010; Cherry and Guo, 2015). Research has also shown that merging these two forms of information can result in further predictive improvement for an NER system (Passos et al., 2014). A similar strategy for enhancing word embeddings has also been demonstrated for sentiment analysis (Tang et al., 2014). Following this line of research, we aim to tailor (post-process) the unsupervised phrase embeddings, created in Section 3.3, for our NER task, using an auto-encoder. The auto-encoder eliminates the need to have access to the original training data and the vector training model, requiring only the trained distributed vectors. In this sense, it can be considered computationally lighter than the above mentioned information fusion methods.1 Our approach is inspired by Ngiam et al. (2011) and Glorot et al. (2011), where auto-encoders are efficiently deployed to generate improved features for doma</context>
</contexts>
<marker>Tang, Wei, Yang, Zhou, Liu, Qin, 2014</marker>
<rawString>Duyu Tang, Furu Wei, Nan Yang, Ming Zhou, Ting Liu, and Bing Qin. 2014. Learning sentiment-specific word embedding for twitter sentiment classification. In ACL, pages 1555–1565.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Joseph Turian</author>
<author>Lev Ratinov</author>
<author>Yoshua Bengio</author>
</authors>
<title>Word representations: a simple and general method for semi-supervised learning.</title>
<date>2010</date>
<booktitle>In ACL,</booktitle>
<pages>384--394</pages>
<contexts>
<context position="11633" citStr="Turian et al., 2010" startWordPosition="1899" endWordPosition="1902">on their context. Otherwise, the embedding features are identical to those described in Section 3.1. Phrase embeddings enable more gazetteer matches for gazetteer-infused vectors, which we discuss next. 3.4 Gazetteer-Infused Phrase Vectors We employ an auto-encoder to leverage knowledge derived from domain-specific gazetteers to make the distributed phrase representations more relevant to our NER task. In recent years, two sources of information have been found to be valuable to boost the performance for NER: distributed representation learned from a large corpus and domain-specific lexicons (Turian et al., 2010; Cherry and Guo, 2015). Research has also shown that merging these two forms of information can result in further predictive improvement for an NER system (Passos et al., 2014). A similar strategy for enhancing word embeddings has also been demonstrated for sentiment analysis (Tang et al., 2014). Following this line of research, we aim to tailor (post-process) the unsupervised phrase embeddings, created in Section 3.3, for our NER task, using an auto-encoder. The auto-encoder eliminates the need to have access to the original training data and the vector training model, requiring only the tra</context>
</contexts>
<marker>Turian, Ratinov, Bengio, 2010</marker>
<rawString>Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010. Word representations: a simple and general method for semi-supervised learning. In ACL, pages 384–394.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>