<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.046165">
<title confidence="0.996269">
Hallym: Named Entity Recognition on Twitter with Induced Word
Representation
</title>
<author confidence="0.998594">
Eun-Suk Yang Yu-Seop Kim
</author>
<affiliation confidence="0.999245">
Hallym University Hallym University
</affiliation>
<email confidence="0.990652">
esyang219@gmail.com yskim01@hallym.ac.kr
</email>
<sectionHeader confidence="0.993659" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999971705882353">
Twitter is a type of social media that con-
tains diverse user-generated texts. Tradi-
tional models are not applicable to tweet
data because the text style is not as gram-
maticalized as that of newswire. In this
paper, we construct word embeddings via
canonical correlation analysis (CCA) on
a considerable amount of tweet data and
show the efficacy of word representation.
Besides word embedding, we use part-
of-speech (POS) tags, chunks, and brown
clusters induced from Wikipedia as fea-
tures. Here, we describe our system and
present the final results along with their
analysis. Our model achieves an F1 score
of 37.21% with entity types and distin-
guishes 53.01% of the entity boundaries.
</bodyText>
<sectionHeader confidence="0.99899" genericHeader="keywords">
1 Introduction
</sectionHeader>
<bodyText confidence="0.997727684210526">
Named entity recognition (NER) is a task of find-
ing and classifying names of things, such as per-
son, location, and organization, given a sequence
of words. NER is a very important subtask of in-
formation extraction (IE).
With the development of the Internet, a huge
amount of information has been generated by
users. The information generated on the Inter-
net, particularly on social media (e.g., Twitter and
Facebook), includes very diverse and noisy texts.
The volume of Twitter data has increased rapidly,
and about 500 million tweets are sent per day1.
In recent years, Twitter data have been considered
a new source in nature, and researchers are paying
increased attention to them (Bollen et al., 2011;
Mathioudakis and Koudas, 2010).
Twitter is a type of microblogging service in which
users are allowed to post contents such as small
messages, individual images, or videos. There
</bodyText>
<footnote confidence="0.969787">
1See “http://www.internetlivestats.com/twitter-statistics/”
</footnote>
<bodyText confidence="0.999826">
are a number of microblogging sites such as Twit-
ter, Tumblr, Plurk and identi.ca. Each service has
its own characteristics. For example, Plurk has a
timeline view for videos and pictures, and Twitter
has “status updates.”
The characteristic of “status updates” is one of the
features that makes the classification of named en-
tities in Twitter difficult. In Twitter, there is a limit
for the number of characters that people can post
at once. People post their thoughts with a short
sentence; this leads to the problem that tweets do
not contain sufficient contextual information (Rit-
ter et al., 2011).
The shared task of ACL W-NUT 2015 is to find
named entities on Twitter. Here, we will fo-
cus on ten types of named entities: company, fa-
cility, geo-loc, movie, musicartist, other, person,
product, sportsteam, and tvshow. We have the
training and development data for Twitter and 53
gazetteers from the abovementioned shared task.
In this paper, we describe the datasets in Section 2
and present the model that we use in this study in
Section 3. In Section 4, we discuss the features
used and the methods used for generating these
features. We present our final results along with
their analysis in Section 5 and conclude this paper
in Section 6.
</bodyText>
<sectionHeader confidence="0.894305" genericHeader="introduction">
2 Data and Labels
</sectionHeader>
<bodyText confidence="0.999676">
In this section, we introduce the considered
datasets and describe the data format used. We
also list the characteristics of each entity type with
some examples.
</bodyText>
<subsectionHeader confidence="0.97184">
2.1 Data
</subsectionHeader>
<bodyText confidence="0.998626">
The datasets provided by the shared task are raw
tweets. Table 1 shows an overview of the sizes of
these datasets. In a tweet, each line contains a word
and its label separated by a tab, and a blank line
forms a sentence boundary. All tokens follow
the IOB format. The token with a B-prefix indi-
</bodyText>
<page confidence="0.985585">
72
</page>
<note confidence="0.807957">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 72–77,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.562351">
cates the beginning of a named entity and the to-
ken with an I-prefix indicates the inside of a named
entity. An I-prefix only follows after a token with
a B-prefix. An O tag indicates that a token does
not belong to a specific named entity.
</bodyText>
<table confidence="0.975494333333333">
Data Tweets Tokens
train 1,795 37,899
test 1,000 16,261
</table>
<tableCaption confidence="0.999884">
Table 1: An overview of datasets.
</tableCaption>
<subsectionHeader confidence="0.998032">
2.2 Labels
</subsectionHeader>
<bodyText confidence="0.995183851851852">
In the system, we focus on the following ten types
of named entities:
company The name of a company or a brand
e.g., Snapchat, Twitter, and Facebook
facility The name of an institution such as a mu-
seum, a center, or a restaurant
e.g., Iowa City schools and Disneyland
geo-loc The name of a city or country
e.g., Chicago and Russia
movie The title of a movie
e.g., Interstellar and Inception
musicartist The name of music groups or disc
jockeys (DJs)
e.g., Taylor Swift and Lady Gaga
other A phrase that can be used generally such as
the name of a ceremony or an anniversary, or
the title of a song
e.g., X-mas and Murphy’s law
person The name of a person; it can be the per-
son’s full name, last name, or first name
e.g., Steve King and Ellen
product The name of a product
e.g., Nokia 5800 and Coke
sportsteam The name of a sports team
e.g., Arsenal and West Ham
tvshow The title of a television (TV) show
e.g., The Persuaders and Pretty Little Liars
</bodyText>
<sectionHeader confidence="0.988431" genericHeader="method">
3 Model
</sectionHeader>
<bodyText confidence="0.967557">
Conditional Random Fields (CRFs) (Lafferty et
al., 2001) and its variants have been successfully
applied to various sequence labeling tasks (Maaten
et al., 2011; Collins, 2002; McCallum and Li,
2003; Kim and Snyder, 2012; Kim et al., 2015b;
Kim et al., 2015a; Kim and Snyder, 2013a; Kim
and Snyder, 2013b). The NER task produces a
sequence of named entity tags, y = (y_1, ..., y_n),
given a sequence of words, x = (x_1, ..., x_n). We
model the conditional probability p(y|x; θ) using
linear-chain CRFs:
</bodyText>
<equation confidence="0.999553333333333">
p(y \mid x; \theta) = \frac{\exp(\theta \cdot \Phi(x, y))}{\sum_{y' \in Y(x)} \exp(\theta \cdot \Phi(x, y'))}
</equation>
<bodyText confidence="0.967940222222222">
where θ denotes a set of model parameters. Y(x)
returns all possible label sequences of x, and Φ
maps (x, y) into a feature vector that is a linear
sum of the local feature vectors: \Phi(x, y) =
\sum_{j=1}^{n} \phi(x, j, y_{j-1}, y_j). Given the fully labeled
sequences \{(x^{(i)}, y^{(i)})\}_{i=1}^{N}, the objective of the
training is to find θ that maximizes the log like-
lihood of the training data under the model with
l2-regularization:
</bodyText>
<equation confidence="0.952505">
\sum_{i=1}^{N} \log p(y^{(i)} \mid x^{(i)}; \theta) - \frac{\lambda}{2} \lVert \theta \rVert^2
</equation>
<sectionHeader confidence="0.998302" genericHeader="method">
4 Features
</sectionHeader>
<bodyText confidence="0.999884444444444">
In this section, we describe a variety of features
that we have used in this study. We also used
CRFsuite2 because it makes the application of new
features easy. Apart from the base features and
gazetteer features provided by the organizers, we
have used the following new features: POS tags,
chunks, brown clustering, and word representa-
tion. Our model is composed of the following fea-
tures:
</bodyText>
<subsectionHeader confidence="0.999335">
4.1 Base features
</subsectionHeader>
<bodyText confidence="0.999970142857143">
Base features include the gazetteer features and
orthographic features. In the NER task, a huge
amount of unlabeled data is often used for iden-
tifying unseen entities. There are already 53
gazetteers in the baseline system. The maximum
window size for gazetteer features is 6, and the
model will learn the named entity type associated
</bodyText>
<equation confidence="0.9245492">
2http://www.chokkan.org/software/crfsuite/
\theta^* = \arg\max_{\theta} \sum_{i=1}^{N}
</equation>
<page confidence="0.965081">
73
</page>
<bodyText confidence="0.99986">
with a specific phrase, if it is in one or more of the
gazetteer lexicons. Orthographic features can be
divided into five types. The orthographic feature
templates are as follows:
</bodyText>
<listItem confidence="0.9965662">
• n-gram: wi for i in {-1,0,1}, conjunction of
previous word and current word wi−1|wi for
i in {-1,0}.
• Affixes: Prefixes and suffixes of xi. The first
and last n characters ranging from 1 to 3.
• Capitalization: There are two patterns of cap-
italization: One is an indicator of capitaliza-
tion for the first character, and the other is an
indicator of capitalization for all characters.
• Digit: There are three patterns for numbers:
i) Whether the current word has a digit, ii)
whether the current word is a single digit, and
iii) whether the current word has two digits.
• Non-alphabet: Whether the current word
contains a hyphen and other punctuation
marks. Among the other punctuation marks
is the colon(:). In general, what follows
right after a colon mark represents a feature
weight. To make the model learn correctly,
we normalize only the colon mark.
</listItem>
<subsectionHeader confidence="0.99161">
4.2 POS tags and chunks
</subsectionHeader>
<bodyText confidence="0.997460363636364">
In the NER task, POS tags and chunks contain
very useful information for finding and classifying
named entities. We predict POS tags and chunks
by using a model trained with Twitter data. For
POS tags, we use a model trained with the Penn
Treebank-style tagset (Ritter et al., 2011). In a
model, some Twitter-specific tags are added by
Ritter et al. (2011): retweets, @usernames, #hash-
tags, and urls. For chunks, we use a named entity
tagger3 by Ritter et al. (2012). Predicted tags are
used as features as follows:
</bodyText>
<listItem confidence="0.99818">
• POS tag: a conjunction feature with the cur-
rent word and the current POS tag w0|p0.
• Chunk tag: a unigram feature for chunk tag
c0 and a conjunction feature with the current
word and the current chunk tag w0|c0.
</listItem>
<footnote confidence="0.982369">
3https://github.com/aritter/twitter nlp
</footnote>
<subsectionHeader confidence="0.993883">
4.3 Brown clustering
</subsectionHeader>
<bodyText confidence="0.999963833333333">
Brown clustering is a hierarchical clustering
method that groups words into a binary tree of
classes (Brown et al., 1992). We downloaded a
brown clustering4 based on Wikipedia provided by
Turian et al. (2010). We used the whole bit string of
the current word.
</bodyText>
<subsectionHeader confidence="0.997677">
4.4 Word representation
</subsectionHeader>
<bodyText confidence="0.999997318181818">
As a new source, tweet data are not applicable to
the traditional model because of the different text
structure. For a new model, it is natural to use
annotated data. However, it is difficult to create
new labeled data for a rapid generation of tweets.
Instead of constantly annotating new data, the gen-
eral solution is creating induced word representa-
tions from a large body of unlabeled data (Mikolov
et al., 2013; Pennington et al., 2014; Kim et al.,
2014; Anastasakos et al., 2014). A lot of previ-
ous work has used CCA because of its simplic-
ity and generality (Kim et al., 2015c; Kim et al.,
2015d; Stratos et al., 2014; Kim et al., 2015b). We
create a word representation by using the canon-
ical correlation analysis (Hotelling, 1936). Fur-
thermore, word embeddings are induced from 13
million tweets containing 270 million tokens. The
dimension of word embeddings we used is 50 with
words occurring more than twice in the data. The
window size for the contextual information is 3:
the current word and a word to the left and the
right of the current word.
</bodyText>
<sectionHeader confidence="0.99998" genericHeader="evaluation">
5 Results
</sectionHeader>
<subsectionHeader confidence="0.997263">
5.1 Error analysis
</subsectionHeader>
<bodyText confidence="0.999963266666667">
Twitter contains noisy and informal style text, and
most of the state-of-the-art applications show a weak
performance on Twitter data (Ritter et al., 2011).
In this section, we check the errors for noisy text
from the baseline system and categorize them. The
last two errors are related to user-generated texts
such as Twitter data.
Unseen word sequences: The main cause of this
error is in a previously unseen sequence. A
huge number of tweets are posted on Twit-
ter every day and they contain up-to-date in-
formation on events. The most recent infor-
mation such as new product information can
lead to the formation of unprecedented word
sequences. These sequences do not appear in
</bodyText>
<footnote confidence="0.969602">
4http://metaoptimize.com/projects/wordreprs/
</footnote>
<page confidence="0.996714">
74
</page>
<table confidence="0.999687846153846">
MnoEmbedding MEmbedding +/-
Type P R F1 P R F1
Overall 35.95 31.92 33.81 39.59 35.10 37.21 +
company 27.59 20.51 23.53 32.14 23.08 26.87 +
facility 24.14 18.42 20.90 32.00 21.05 25.40 +
geo-loc 42.66 52.59 47.10 46.00 59.48 51.88 +
movie 14.29 6.67 9.09 8.33 6.67 7.41 -
musicartist 0.00 0.00 0.00 7.69 2.44 3.70 +
other 18.33 16.67 17.46 20.49 18.94 19.69 +
person 53.27 61.99 57.30 56.99 64.33 60.44 +
product 3.57 2.70 3.08 14.29 8.11 10.34 +
sportsteam 62.50 7.14 12.82 54.55 8.57 14.81 +
tvshow 0.00 0.00 0.00 0.00 0.00 0.00 .
</table>
<tableCaption confidence="0.962709">
Table 2: Results for model with and without word embedding. MnoEmbedding and MEmbedding represent
</tableCaption>
<bodyText confidence="0.999588109090909">
the model without and with word embedding, respectively. The rightmost column shows the decrease or
increase in the F1 score with respect to the model without word embedding. MEmbedding denotes our
final model.
the training data and gazetteers, and thus, the
model cannot learn them.
Foreign languages: This error is caused by
tweets written in languages other than En-
glish. Words written in foreign languages
are annotated with the O tag and do not indicate a
named entity. However, some words have the
same spelling as an English word and thus,
activate the gazetteer features. This problem
leads to words with the O tag being predicted
as a named entity type.
Type disambiguation: There are some words
that have the same spelling but belong to dif-
ferent types according to the contextual in-
formation. This error is often observed for
named entities such as sportsteam and musi-
cartist. The word sequences with this error
have a correctly distinguished entity bound-
ary but predict the wrong entity type. For ex-
ample, Tampa Bay in “Losing to the Penguins
quasi-AHL lineup in December is a non-issue
for Tampa Bay” is an entity for sportsteam,
but the model classifies it as geo-loc instead
of sportsteam. In another example, the names
of two music artists in “Will Shawn Mendez
be opening up for Taylor Swift” are predicted
as person and not as musicartist.
Informal name or abbreviations: Twitter users
compress what they want to say to meet the
limit of 140 characters. This leads to in-
formal texts unlike in news articles. Note
that abbreviations do not indicate official full
forms such as airports or countries. For ex-
ample, Southie in “Proud that the 1st modern
Olympic Champion is James Brendan Con-
nolly of #Southie .” is an informal name of
South Boston, and this word does not appear
in the training set and gazetteers. With re-
spect to abbreviations, people use abbrevia-
tions for indicating a day or a month, such as
Mon for Monday and Jan for January. These
words are contained in gazetteers and activate
the gazetteer features. A model makes errors
by predicting them as named entities.
Hashtag: A hashtag is a combination of the “#”
sign and some characters for organizing word
sequences as searchable links in Twitter. The
rule is to not use any space between the char-
acters in the hashtag. For instance, the word
New Delhi is transformed into #NewDelhi
as a hashtag, so it is difficult to check the
gazetteer lexicons for such text.
</bodyText>
<subsectionHeader confidence="0.998647">
5.2 The effectiveness of word embedding
</subsectionHeader>
<bodyText confidence="0.999815888888889">
In this subsection, we describe the effectiveness
of word embedding by analyzing the results
obtained by using the model with and without
word embedding. The only difference between
the two models is the use of brown clustering and
the word representation based on CCA.
In the NER task, the F1 score is a more appro-
priate metric than accuracy. Most of the labels in
the NER data contain the O tag, indicating that
</bodyText>
<page confidence="0.997256">
75
</page>
<bodyText confidence="0.999642848484848">
they are not an entity. Since this leads to high
accuracy, by using the F1 score, we obtain a more
reasonable harmonic function of the precision and
the recall.
Table 2 shows the results obtained by using
models with and without word embedding. As
shown in table 2, brown clustering and word
embedding have a good effect on performance.
All types of entities except movie show error
reduction. For determining the efficacy of word
embedding, we compare the errors between the
models without word embedding and with word
embedding. We find that word embedding plays
an important role in resolving the problem of
unseen word sequences and the problem of type
disambiguation. First, the model without word
embedding does not learn about an entity ipad
Mini Retina 2nd Generation 16GB wifi because
some of the words do not appear in the training
data. In contrast, the model with embedding can
learn unseen words from the induced word repre-
sentation. This helps the model to predict that the
abovementioned entity indicates a product name.
The model without word embedding also has
the problem of disambiguation of a word Edison
because the model only learns that this word is a
person’s name from the gazetteers. However, in
the word sequence “Edison #weather on January
16 , 2015”, Edison indicates a town in New Jersey.
The model with word embedding is provided with
additional information by the word embedding
process and predicts the abovementioned word as
geo-loc correctly.
</bodyText>
<sectionHeader confidence="0.999249" genericHeader="conclusions">
6 Conclusion
</sectionHeader>
<bodyText confidence="0.999917375">
In this paper, we described the data and features
used for generating our model. Besides POS tags
and chunk tags, we used a word representation
based on CCA for improving the model’s perfor-
mance. Our final model shows an error reduc-
tion of 14.08% from the baseline system. We also
presented some primary and Twitter-specific prob-
lems by categorizing errors.
</bodyText>
<sectionHeader confidence="0.998964" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.998499821428571">
Tasos Anastasakos, Young-Bum Kim, and Anoop Deo-
ras. 2014. Task specific continuous word represen-
tations for mono and multi-lingual spoken language
understanding. In ICASSP, pages 3246–3250. IEEE.
Johan Bollen, Huina Mao, and Xiaojun Zeng. 2011.
Twitter mood predicts the stock market. Journal of
Computational Science, 2(1):1–8.
Peter F Brown, Peter V Desouza, Robert L Mercer,
Vincent J Della Pietra, and Jenifer C Lai. 1992.
Class-based n-gram models of natural language.
Computational linguistics, 18(4):467–479.
Michael Collins. 2002. Discriminative training meth-
ods for hidden markov models: Theory and exper-
iments with perceptron algorithms. In Proceedings
of the ACL-02 conference on Empirical methods in
natural language processing-Volume 10, pages 1–8.
Association for Computational Linguistics.
Harold Hotelling. 1936. Relations between two sets of
variates. Biometrika, pages 321–377.
Young-Bum Kim and Benjamin Snyder. 2012. Univer-
sal grapheme-to-phoneme prediction over latin al-
phabets. In EMNLP, pages 332–343. Association
for Computational Linguistics.
Young-Bum Kim and Benjamin Snyder. 2013a. Opti-
mal data set selection: An application to grapheme-
to-phoneme conversion. In HLT-NAACL, pages
1196–1205. Association for Computational Linguis-
tics.
Young-Bum Kim and Benjamin Snyder. 2013b. Unsu-
pervised consonant-vowel prediction over hundreds
of languages. In ACL (1), pages 1527–1536.
Young-Bum Kim, Heemoon Chae, Benjamin Snyder,
and Yu-Seop Kim. 2014. Training a korean srl
system with rich morphological features. In ACL,
pages 637–642. Association for Computational Lin-
guistics.
Young-Bum Kim, Minwoo Jeong, Karl Stratos, and
Ruhi Sarikaya. 2015a. Weakly supervised slot
tagging with partially labeled sequences from web
search click logs. In HLT-NAACL, pages 84–92. As-
sociation for Computational Linguistics.
Young-Bum Kim, Karl Stratos, Xiaohu Liu, and Ruhi
Sarikaya. 2015b. Compact lexicon selection with
spectral methods. In ACL. Association for Compu-
tational Linguistics.
Young-Bum Kim, Karl Stratos, and Ruhi Sarikaya.
2015c. Pre-training of hidden-unit crfs. In ACL.
Association for Computational Linguistics.
Young-Bum Kim, Karl Stratos, Ruhi Sarikaya, and
Minwoo Jeong. 2015d. New transfer learning tech-
niques for disparate label sets. In ACL. Association
for Computational Linguistics.
John Lafferty, Andrew McCallum, and Fernando CN
Pereira. 2001. Conditional random fields: Prob-
abilistic models for segmenting and labeling se-
quence data.
</reference>
<page confidence="0.844876">
76
</page>
<reference confidence="0.999397255813953">
Laurens Maaten, Max Welling, and Lawrence K Saul.
2011. Hidden-unit conditional random fields. In In-
ternational Conference on Artificial Intelligence and
Statistics.
Michael Mathioudakis and Nick Koudas. 2010. Twit-
termonitor: trend detection over the twitter stream.
In Proceedings of the 2010 ACM SIGMOD Inter-
national Conference on Management of data, pages
1155–1158. ACM.
Andrew McCallum and Wei Li. 2003. Early results
for named entity recognition with conditional ran-
dom fields, feature induction and web-enhanced lex-
icons. In HLT-NAACL, pages 188–191. Association
for Computational Linguistics.
Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S Cor-
rado, and Jeff Dean. 2013. Distributed representa-
tions of words and phrases and their compositional-
ity. In Advances in neural information processing
systems, pages 3111–3119.
Jeffrey Pennington, Richard Socher, and Christopher D
Manning. 2014. Glove: Global vectors for
word representation. Proceedings of the Empiricial
Methods in Natural Language Processing (EMNLP
2014), 12:1532–1543.
Alan Ritter, Sam Clark, Oren Etzioni, et al. 2011.
Named entity recognition in tweets: an experimental
study. In Proceedings of the Conference on Empiri-
cal Methods in Natural Language Processing, pages
1524–1534. Association for Computational Linguis-
tics.
Alan Ritter, Mausam, Oren Etzioni, and Sam Clark.
2012. Open domain event extraction from twitter.
In KDD.
Karl Stratos, Do-kyum Kim, Michael Collins, and
Daniel Hsu. 2014. A spectral algorithm for learning
class-based n-gram models of natural language. In
UAI.
Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010.
Word representations: a simple and general method
for semi-supervised learning. In Proceedings of the
48th annual meeting of the association for compu-
tational linguistics, pages 384–394. Association for
Computational Linguistics.
</reference>
<page confidence="0.999124">
77
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.541743">
<title confidence="0.99916">Hallym: Named Entity Recognition on Twitter with Induced Representation</title>
<author confidence="0.998622">Eun-Suk Yang Yu-Seop Kim</author>
<affiliation confidence="0.99996">Hallym University Hallym University</affiliation>
<email confidence="0.585159">esyang219@gmail.comyskim01@hallym.ac.kr</email>
<abstract confidence="0.995967222222222">Twitter is a type of social media that contains diverse user-generated texts. Traditional models are not applicable to tweet data because the text style is not as grammaticalized as that of newswire. In this paper, we construct word embeddings via canonical correlation analysis (CCA) on a considerable amount of tweet data and show the efficacy of word representation. Besides word embedding, we use partof-speech (POS) tags, chunks, and brown clusters induced from Wikipedia as features. Here, we describe our system and present the final results along with their analysis. Our model achieves an F1 score of 37.21% with entity types and distinguishes 53.01% of the entity boundaries.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Tasos Anastasakos</author>
<author>Young-Bum Kim</author>
<author>Anoop Deoras</author>
</authors>
<title>Task specific continuous word representations for mono and multi-lingual spoken language understanding.</title>
<date>2014</date>
<booktitle>In ICASSP,</booktitle>
<pages>3246--3250</pages>
<publisher>IEEE.</publisher>
<contexts>
<context position="9459" citStr="Anastasakos et al., 2014" startWordPosition="1606" endWordPosition="1609"> brown clustering4 based on Wikipedia provided by Turian et al. (2010). We used whole bit string of the current word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right of the current word. 5 Results 5.1 Error anal</context>
</contexts>
<marker>Anastasakos, Kim, Deoras, 2014</marker>
<rawString>Tasos Anastasakos, Young-Bum Kim, and Anoop Deoras. 2014. Task specific continuous word representations for mono and multi-lingual spoken language understanding. In ICASSP, pages 3246–3250. IEEE.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Johan Bollen</author>
<author>Huina Mao</author>
<author>Xiaojun Zeng</author>
</authors>
<title>Twitter mood predicts the stock market.</title>
<date>2011</date>
<journal>Journal of Computational Science,</journal>
<volume>2</volume>
<issue>1</issue>
<contexts>
<context position="1588" citStr="Bollen et al., 2011" startWordPosition="248" endWordPosition="251">ngs, such as person, location, and organization, given a sequence of words. NER is a very important subtask of information extraction (IE). With the development of the Internet, a huge amount of information has been generated by users. The information generated on the Internet, particularly on social media (e.g., Twitter and Facebook), includes very diverse and noisy texts. The volume of Twitter data has increased rapidly, and about 500 million tweets are sent per day1. In recent years, Twitter data have considered a new source in nature and researchers are paying increased attention to them (Bollen et al., 2011; Mathioudakis and Koudas, 2010). Twitter is a type of microblogging service in which users are allowed to post contents such as small messages, individual images, or videos. There 1See “http://www.internetlivestats.com/twitter-statistics/” are a number of microblogging sites such as Twitter, Tumblr, Plurk and identi.ca. Each service has its own characteristics. For example, Plurk has a timeline view for videos and pictures, and Twitter has “status updates.” The characteristic of “status updates” is one of the features that makes the classification of named entities in Twitter difficult. In Tw</context>
</contexts>
<marker>Bollen, Mao, Zeng, 2011</marker>
<rawString>Johan Bollen, Huina Mao, and Xiaojun Zeng. 2011. Twitter mood predicts the stock market. Journal of Computational Science, 2(1):1–8.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Peter F Brown</author>
<author>Peter V Desouza</author>
<author>Robert L Mercer</author>
<author>Vincent J Della Pietra</author>
<author>Jenifer C Lai</author>
</authors>
<title>Class-based n-gram models of natural language.</title>
<date>1992</date>
<journal>Computational linguistics,</journal>
<volume>18</volume>
<issue>4</issue>
<contexts>
<context position="8817" citStr="Brown et al., 1992" startWordPosition="1496" endWordPosition="1499">, some Twitter-specific tags are added by Ritter et al. (2011): retweets, @usernames, #hashtags, and urls. For chunks, we use a named entity tagger3 by Ritter et al. (2012). Predicted tags are used as features as follows: • POS tag: a conjunction feature with the current word and the current POS tag w0|p0. • Chunk tag: a unigram feature for chunk tag c0 and a conjunction feature with the current word and the current chunk tag w0|c0. 3https://github.com/aritter/twitter nlp 4.3 Brown clustering Brown clustering is a hierarchical clustering method that groups words into a binary tree of classes (Brown et al., 1992). We downloaded a brown clustering4 based on Wikipedia provided by Turian et al. (2010). We used whole bit string of the current word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; K</context>
</contexts>
<marker>Brown, Desouza, Mercer, Pietra, Lai, 1992</marker>
<rawString>Peter F Brown, Peter V Desouza, Robert L Mercer, Vincent J Della Pietra, and Jenifer C Lai. 1992. Class-based n-gram models of natural language. Computational linguistics, 18(4):467–479.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Michael Collins</author>
</authors>
<title>Discriminative training methods for hidden markov models: Theory and experiments with perceptron algorithms.</title>
<date>2002</date>
<booktitle>In Proceedings of the ACL-02 conference on Empirical methods in natural language processing-Volume 10,</booktitle>
<pages>1--8</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<contexts>
<context position="5200" citStr="Collins, 2002" startWordPosition="872" endWordPosition="873">ally such as the name of a ceremony or an anniversary, or the title of a song e.g., X-mas and Murphy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1, yj). Given the</context>
</contexts>
<marker>Collins, 2002</marker>
<rawString>Michael Collins. 2002. Discriminative training methods for hidden markov models: Theory and experiments with perceptron algorithms. In Proceedings of the ACL-02 conference on Empirical methods in natural language processing-Volume 10, pages 1–8. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Harold Hotelling</author>
</authors>
<title>Relations between two sets of variates.</title>
<date>1936</date>
<journal>Biometrika,</journal>
<pages>321--377</pages>
<contexts>
<context position="9713" citStr="Hotelling, 1936" startWordPosition="1654" endWordPosition="1655">w model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right of the current word. 5 Results 5.1 Error analysis Twitter contains noisy and informal style text, and most of the state-of-art applications show a weak performance on Twitter data (Ritter et al., 2011). In this section, we check the errors for noisy text from the baseline system and categorize them</context>
</contexts>
<marker>Hotelling, 1936</marker>
<rawString>Harold Hotelling. 1936. Relations between two sets of variates. Biometrika, pages 321–377.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Young-Bum Kim</author>
<author>Benjamin Snyder</author>
</authors>
<title>Universal grapheme-to-phoneme prediction over latin alphabets.</title>
<date>2012</date>
<booktitle>EMNLP,</booktitle>
<pages>332--343</pages>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="5245" citStr="Kim and Snyder, 2012" startWordPosition="878" endWordPosition="881"> an anniversary, or the title of a song e.g., X-mas and Murphy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1, yj). Given the fully labeled sequences {(x(i), y(i))}Ni=1, </context>
</contexts>
<marker>Kim, Snyder, 2012</marker>
<rawString>Young-Bum Kim and Benjamin Snyder. 2012. Universal grapheme-to-phoneme prediction over latin alphabets. In EMNLP, pages 332–343. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Young-Bum Kim</author>
<author>Benjamin Snyder</author>
</authors>
<title>Optimal data set selection: An application to grapheme-to-phoneme conversion.</title>
<date>2013</date>
<booktitle>In HLT-NAACL,</booktitle>
<pages>1196--1205</pages>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="5305" citStr="Kim and Snyder, 2013" startWordPosition="890" endWordPosition="893">hy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1, yj). Given the fully labeled sequences {(x(i), y(i))}Ni=1, the objective of the training is to find 0 that maximizes th</context>
</contexts>
<marker>Kim, Snyder, 2013</marker>
<rawString>Young-Bum Kim and Benjamin Snyder. 2013a. Optimal data set selection: An application to graphemeto-phoneme conversion. In HLT-NAACL, pages 1196–1205. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Young-Bum Kim</author>
<author>Benjamin Snyder</author>
</authors>
<title>Unsupervised consonant-vowel prediction over hundreds of languages.</title>
<date>2013</date>
<booktitle>In ACL</booktitle>
<volume>1</volume>
<pages>1527--1536</pages>
<contexts>
<context position="5305" citStr="Kim and Snyder, 2013" startWordPosition="890" endWordPosition="893">hy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1, yj). Given the fully labeled sequences {(x(i), y(i))}Ni=1, the objective of the training is to find 0 that maximizes th</context>
</contexts>
<marker>Kim, Snyder, 2013</marker>
<rawString>Young-Bum Kim and Benjamin Snyder. 2013b. Unsupervised consonant-vowel prediction over hundreds of languages. In ACL (1), pages 1527–1536.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Young-Bum Kim</author>
<author>Heemoon Chae</author>
<author>Benjamin Snyder</author>
<author>Yu-Seop Kim</author>
</authors>
<title>Training a korean srl system with rich morphological features.</title>
<date>2014</date>
<booktitle>In ACL,</booktitle>
<pages>637--642</pages>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="9432" citStr="Kim et al., 2014" startWordPosition="1602" endWordPosition="1605">). We downloaded a brown clustering4 based on Wikipedia provided by Turian et al. (2010). We used whole bit string of the current word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right of the current wor</context>
</contexts>
<marker>Kim, Chae, Snyder, Kim, 2014</marker>
<rawString>Young-Bum Kim, Heemoon Chae, Benjamin Snyder, and Yu-Seop Kim. 2014. Training a korean srl system with rich morphological features. In ACL, pages 637–642. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Young-Bum Kim</author>
<author>Minwoo Jeong</author>
<author>Karl Stratos</author>
<author>Ruhi Sarikaya</author>
</authors>
<title>Weakly supervised slot tagging with partially labeled sequences from web search click logs.</title>
<date>2015</date>
<booktitle>In HLT-NAACL,</booktitle>
<pages>84--92</pages>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="5263" citStr="Kim et al., 2015" startWordPosition="882" endWordPosition="885">e title of a song e.g., X-mas and Murphy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1, yj). Given the fully labeled sequences {(x(i), y(i))}Ni=1, the objective of t</context>
<context position="9556" citStr="Kim et al., 2015" startWordPosition="1626" endWordPosition="1629">rrent word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right of the current word. 5 Results 5.1 Error analysis Twitter contains noisy and informal style text, and most of the state-of-art applications sh</context>
</contexts>
<marker>Kim, Jeong, Stratos, Sarikaya, 2015</marker>
<rawString>Young-Bum Kim, Minwoo Jeong, Karl Stratos, and Ruhi Sarikaya. 2015a. Weakly supervised slot tagging with partially labeled sequences from web search click logs. In HLT-NAACL, pages 84–92. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Young-Bum Kim</author>
<author>Karl Stratos</author>
<author>Xiaohu Liu</author>
<author>Ruhi Sarikaya</author>
</authors>
<title>Compact lexicon selection with spectral methods.</title>
<date>2015</date>
<booktitle>In ACL.</booktitle>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="5263" citStr="Kim et al., 2015" startWordPosition="882" endWordPosition="885">e title of a song e.g., X-mas and Murphy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1, yj). Given the fully labeled sequences {(x(i), y(i))}Ni=1, the objective of t</context>
<context position="9556" citStr="Kim et al., 2015" startWordPosition="1626" endWordPosition="1629">rrent word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right of the current word. 5 Results 5.1 Error analysis Twitter contains noisy and informal style text, and most of the state-of-art applications sh</context>
</contexts>
<marker>Kim, Stratos, Liu, Sarikaya, 2015</marker>
<rawString>Young-Bum Kim, Karl Stratos, Xiaohu Liu, and Ruhi Sarikaya. 2015b. Compact lexicon selection with spectral methods. In ACL. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Young-Bum Kim</author>
<author>Karl Stratos</author>
<author>Ruhi Sarikaya</author>
</authors>
<title>Pre-training of hidden-unit crfs.</title>
<date>2015</date>
<booktitle>In ACL.</booktitle>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="5263" citStr="Kim et al., 2015" startWordPosition="882" endWordPosition="885">e title of a song e.g., X-mas and Murphy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1, yj). Given the fully labeled sequences {(x(i), y(i))}Ni=1, the objective of t</context>
<context position="9556" citStr="Kim et al., 2015" startWordPosition="1626" endWordPosition="1629">rrent word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right of the current word. 5 Results 5.1 Error analysis Twitter contains noisy and informal style text, and most of the state-of-art applications sh</context>
</contexts>
<marker>Kim, Stratos, Sarikaya, 2015</marker>
<rawString>Young-Bum Kim, Karl Stratos, and Ruhi Sarikaya. 2015c. Pre-training of hidden-unit crfs. In ACL. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Young-Bum Kim</author>
<author>Karl Stratos</author>
<author>Ruhi Sarikaya</author>
<author>Minwoo Jeong</author>
</authors>
<title>New transfer learning techniques for disparate label sets.</title>
<date>2015</date>
<booktitle>In ACL.</booktitle>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="5263" citStr="Kim et al., 2015" startWordPosition="882" endWordPosition="885">e title of a song e.g., X-mas and Murphy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1, yj). Given the fully labeled sequences {(x(i), y(i))}Ni=1, the objective of t</context>
<context position="9556" citStr="Kim et al., 2015" startWordPosition="1626" endWordPosition="1629">rrent word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right of the current word. 5 Results 5.1 Error analysis Twitter contains noisy and informal style text, and most of the state-of-art applications sh</context>
</contexts>
<marker>Kim, Stratos, Sarikaya, Jeong, 2015</marker>
<rawString>Young-Bum Kim, Karl Stratos, Ruhi Sarikaya, and Minwoo Jeong. 2015d. New transfer learning techniques for disparate label sets. In ACL. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>John Lafferty</author>
<author>Andrew McCallum</author>
<author>Fernando CN Pereira</author>
</authors>
<title>Conditional random fields: Probabilistic models for segmenting and labeling sequence data.</title>
<date>2001</date>
<contexts>
<context position="5081" citStr="Lafferty et al., 2001" startWordPosition="852" endWordPosition="855">usicartist The name of music groups or disc jockeys (DJs) e.g., Taylor Swift and Lady Gaga other A phrase that can be used generally such as the name of a ceremony or an anniversary, or the title of a song e.g., X-mas and Murphy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y)</context>
</contexts>
<marker>Lafferty, McCallum, Pereira, 2001</marker>
<rawString>John Lafferty, Andrew McCallum, and Fernando CN Pereira. 2001. Conditional random fields: Probabilistic models for segmenting and labeling sequence data.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Laurens Maaten</author>
<author>Max Welling</author>
<author>Lawrence K Saul</author>
</authors>
<title>Hidden-unit conditional random fields.</title>
<date>2011</date>
<booktitle>In International Conference on Artificial Intelligence and Statistics.</booktitle>
<contexts>
<context position="5185" citStr="Maaten et al., 2011" startWordPosition="868" endWordPosition="871">hat can be used generally such as the name of a ceremony or an anniversary, or the title of a song e.g., X-mas and Murphy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1,</context>
</contexts>
<marker>Maaten, Welling, Saul, 2011</marker>
<rawString>Laurens Maaten, Max Welling, and Lawrence K Saul. 2011. Hidden-unit conditional random fields. In International Conference on Artificial Intelligence and Statistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Michael Mathioudakis</author>
<author>Nick Koudas</author>
</authors>
<title>Twittermonitor: trend detection over the twitter stream.</title>
<date>2010</date>
<booktitle>In Proceedings of the 2010 ACM SIGMOD International Conference on Management of data,</booktitle>
<pages>1155--1158</pages>
<publisher>ACM.</publisher>
<contexts>
<context position="1620" citStr="Mathioudakis and Koudas, 2010" startWordPosition="252" endWordPosition="255">location, and organization, given a sequence of words. NER is a very important subtask of information extraction (IE). With the development of the Internet, a huge amount of information has been generated by users. The information generated on the Internet, particularly on social media (e.g., Twitter and Facebook), includes very diverse and noisy texts. The volume of Twitter data has increased rapidly, and about 500 million tweets are sent per day1. In recent years, Twitter data have considered a new source in nature and researchers are paying increased attention to them (Bollen et al., 2011; Mathioudakis and Koudas, 2010). Twitter is a type of microblogging service in which users are allowed to post contents such as small messages, individual images, or videos. There 1See “http://www.internetlivestats.com/twitter-statistics/” are a number of microblogging sites such as Twitter, Tumblr, Plurk and identi.ca. Each service has its own characteristics. For example, Plurk has a timeline view for videos and pictures, and Twitter has “status updates.” The characteristic of “status updates” is one of the features that makes the classification of named entities in Twitter difficult. In Twitter, there is a limit for the </context>
</contexts>
<marker>Mathioudakis, Koudas, 2010</marker>
<rawString>Michael Mathioudakis and Nick Koudas. 2010. Twittermonitor: trend detection over the twitter stream. In Proceedings of the 2010 ACM SIGMOD International Conference on Management of data, pages 1155–1158. ACM.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Andrew McCallum</author>
<author>Wei Li</author>
</authors>
<title>Early results for named entity recognition with conditional random fields, feature induction and web-enhanced lexicons.</title>
<date>2003</date>
<booktitle>In HLT-NAACL,</booktitle>
<pages>188--191</pages>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="5223" citStr="McCallum and Li, 2003" startWordPosition="874" endWordPosition="877">e name of a ceremony or an anniversary, or the title of a song e.g., X-mas and Murphy’s law person The name of a person; it can be the person’s full name, last name, or first name e.g., Steve King and Ellen product The name of a product e.g., Nokia 5800 and Coke sportsteam The name of a sports team e.g., Arsenal and West Ham tvshow The title of a television (TV) show e.g., The Persuaders and Pretty Little Liars 3 Model Conditional Random Fields (CRFs) (Lafferty et al., 2001) and its variants have been successfully applied to various sequence labeling tasks (Maaten et al., 2011; Collins, 2002; McCallum and Li, 2003; Kim and Snyder, 2012; Kim et al., 2015b; Kim et al., 2015a; Kim and Snyder, 2013a; Kim and Snyder, 2013b). The NER task produces a sequence of named entity tags, y = (y1 ... yn), given a sequence of words, x = (x1 ... xn). We model the conditional probability p(y|x; 0) using linear-chain CRFs: exp(0 · Φ(x, y)) p(y|x; 0) = EyIEY(x) exp(0 · Φ(x, y&apos;)) where 0 denotes a set of model parameters. Y returns all possible label sequences of x, and Φ maps (x, y) into a feature vector that is a linear sum of the local feature vectors: Φ(x, y) = En j=1 O(x, j, yj−1, yj). Given the fully labeled sequence</context>
</contexts>
<marker>McCallum, Li, 2003</marker>
<rawString>Andrew McCallum and Wei Li. 2003. Early results for named entity recognition with conditional random fields, feature induction and web-enhanced lexicons. In HLT-NAACL, pages 188–191. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Tomas Mikolov</author>
<author>Ilya Sutskever</author>
<author>Kai Chen</author>
<author>Greg S Corrado</author>
<author>Jeff Dean</author>
</authors>
<title>Distributed representations of words and phrases and their compositionality.</title>
<booktitle>In Advances in neural information processing systems,</booktitle>
<date>2013</date>
<pages>3111--3119</pages>
<contexts>
<context position="9389" citStr="Mikolov et al., 2013" startWordPosition="1594" endWordPosition="1597">to a binary tree of classes (Brown et al., 1992). We downloaded a brown clustering4 based on Wikipedia provided by Turian et al. (2010). We used whole bit string of the current word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word t</context>
</contexts>
<marker>Mikolov, Sutskever, Chen, Corrado, Dean, 2013</marker>
<rawString>Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S Corrado, and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. In Advances in neural information processing systems, pages 3111–3119.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jeffrey Pennington</author>
<author>Richard Socher</author>
<author>Christopher D Manning</author>
</authors>
<title>Glove: Global vectors for word representation.</title>
<date>2014</date>
<booktitle>Proceedings of the Empirical Methods in Natural Language Processing (EMNLP 2014),</booktitle>
<volume>12</volume>
<pages>1532--1543</pages>
<contexts>
<context position="9414" citStr="Pennington et al., 2014" startWordPosition="1598" endWordPosition="1601">asses (Brown et al., 1992). We downloaded a brown clustering4 based on Wikipedia provided by Turian et al. (2010). We used whole bit string of the current word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right </context>
</contexts>
<marker>Pennington, Socher, Manning, 2014</marker>
<rawString>Jeffrey Pennington, Richard Socher, and Christopher D Manning. 2014. Glove: Global vectors for word representation. Proceedings of the Empiricial Methods in Natural Language Processing (EMNLP 2014), 12:1532–1543.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Sam Clark</author>
<author>Oren Etzioni</author>
</authors>
<title>Named entity recognition in tweets: an experimental study.</title>
<date>2011</date>
<booktitle>In Proceedings of the Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>1524--1534</pages>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="2429" citStr="Ritter et al., 2011" startWordPosition="377" endWordPosition="381">witter-statistics/” are a number of microblogging sites such as Twitter, Tumblr, Plurk and identi.ca. Each service has its own characteristics. For example, Plurk has a timeline view for videos and pictures, and Twitter has “status updates.” The characteristic of “status updates” is one of the features that makes the classification of named entities in Twitter difficult. In Twitter, there is a limit for the number of characters that people can post at once. People post their thoughts with a short sentence; this leads to the problem that tweets do not contain sufficient contextual information (Ritter et al., 2011). The shared task of ACL W-NUT 2015 is to find named entities on Twitter. Here, we will focus on ten types of named entities: company, facility, geo-loc, movie, musicartist, other, person, product, sportsteam, and tvshow. We have the training and development data for Twitter and 53 gazetteers from the abovementioned shared task. In this paper, we describe the datasets in Section 2 and present the model that we use in this study in Section 3. In Section 4, we discuss the features used and the methods used for generating these features. We present our final results along with their analysis in S</context>
<context position="8186" citStr="Ritter et al., 2011" startWordPosition="1390" endWordPosition="1393">rrent word has two digits. • Non-alphabet: Whether the current word contains a hyphen and other punctuation marks. Among the other punctuation marks is the colon(:). In general, what follows right after a colon mark represents a feature weight. To make the model learn correctly, we normalize only the colon mark. 4.2 POS tags and chunks In the NER task, POS tags and chunks contain very useful information for finding and classifying named entities. We predict POS tags and chunks by using a model trained with Twitter data. For POS tags, we use a model trained with the Penn Treebank-style tagset (Ritter et al., 2011). In a model, some Twitter-specific tags are added by Ritter et al. (2011): retweets, @usernames, #hashtags, and urls. For chunks, we use a named entity tagger3 by Ritter et al. (2012). Predicted tags are used as features as follows: • POS tag: a conjunction feature with the current word and the current POS tag w0|p0. • Chunk tag: a unigram feature for chunk tag c0 and a conjunction feature with the current word and the current chunk tag w0|c0. 3https://github.com/aritter/twitter nlp 4.3 Brown clustering Brown clustering is a hierarchical clustering method that groups words into a binary tree </context>
<context position="10215" citStr="Ritter et al., 2011" startWordPosition="1738" endWordPosition="1741">014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right of the current word. 5 Results 5.1 Error analysis Twitter contains noisy and informal style text, and most of the state-of-art applications show a weak performance on Twitter data (Ritter et al., 2011). In this section, we check the errors for noisy text from the baseline system and categorize them. The last two errors are related to user-generated texts such as Twitter data. Unseen word sequences: The main cause of this error is in a previously unseen sequence. A huge number of tweets are posted on Twitter every day and they contain up-to-date information on events. The most recent information such as new product information can lead to the formation of unprecedented word sequences. These sequences do not appear in 4http://metaoptimize.com/projects/wordreprs/ 74 MnoEmbedding MEmbedding +/-</context>
</contexts>
<marker>Ritter, Clark, Etzioni, 2011</marker>
<rawString>Alan Ritter, Sam Clark, Oren Etzioni, et al. 2011. Named entity recognition in tweets: an experimental study. In Proceedings of the Conference on Empirical Methods in Natural Language Processing, pages 1524–1534. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Mausam</author>
<author>Oren Etzioni</author>
<author>Sam Clark</author>
</authors>
<title>Open domain event extraction from twitter.</title>
<date>2012</date>
<booktitle>In KDD.</booktitle>
<contexts>
<context position="8370" citStr="Ritter et al. (2012)" startWordPosition="1422" endWordPosition="1425">ollows right after a colon mark represents a feature weight. To make the model learn correctly, we normalize only the colon mark. 4.2 POS tags and chunks In the NER task, POS tags and chunks contain very useful information for finding and classifying named entities. We predict POS tags and chunks by using a model trained with Twitter data. For POS tags, we use a model trained with the Penn Treebank-style tagset (Ritter et al., 2011). In a model, some Twitter-specific tags are added by Ritter et al. (2011): retweets, @usernames, #hashtags, and urls. For chunks, we use a named entity tagger3 by Ritter et al. (2012). Predicted tags are used as features as follows: • POS tag: a conjunction feature with the current word and the current POS tag w0|p0. • Chunk tag: a unigram feature for chunk tag c0 and a conjunction feature with the current word and the current chunk tag w0|c0. 3https://github.com/aritter/twitter nlp 4.3 Brown clustering Brown clustering is a hierarchical clustering method that groups words into a binary tree of classes (Brown et al., 1992). We downloaded a brown clustering4 based on Wikipedia provided by Turian et al. (2010). We used whole bit string of the current word. 4.4 Word represent</context>
</contexts>
<marker>Ritter, Mausam, Clark, 2012</marker>
<rawString>Alan Ritter, Mausam, Oren Etzioni, and Sam Clark. 2012. Open domain event extraction from twitter. In KDD.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Karl Stratos</author>
<author>Do-kyum Kim</author>
<author>Michael Collins</author>
<author>Daniel Hsu</author>
</authors>
<title>A spectral algorithm for learning class-based n-gram models of natural language.</title>
<date>2014</date>
<booktitle>In UAI.</booktitle>
<contexts>
<context position="9598" citStr="Stratos et al., 2014" startWordPosition="1634" endWordPosition="1637"> a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA because of its simplicity and generality (Kim et al., 2015c; Kim et al., 2015d; Stratos et al., 2014; Kim et al., 2015b). We create a word representation by using the canonical correlation analysis (Hotelling, 1936). Furthermore, word embeddings are induced from 13 million tweets containing 270 million tokens. The dimension of word embeddings we used is 50 with words occurring more than twice in the data. The window size for the contextual information is 3: the current word and a word to the left and the right of the current word. 5 Results 5.1 Error analysis Twitter contains noisy and informal style text, and most of the state-of-art applications show a weak performance on Twitter data (Rit</context>
</contexts>
<marker>Stratos, Kim, Collins, Hsu, 2014</marker>
<rawString>Karl Stratos, Do-kyum Kim, Michael Collins, and Daniel Hsu. 2014. A spectral algorithm for learning class-based n-gram models of natural language. In UAI.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Joseph Turian</author>
<author>Lev Ratinov</author>
<author>Yoshua Bengio</author>
</authors>
<title>Word representations: a simple and general method for semi-supervised learning.</title>
<date>2010</date>
<booktitle>In Proceedings of the 48th annual meeting of the association for computational linguistics,</booktitle>
<pages>384--394</pages>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="8904" citStr="Turian et al. (2010)" startWordPosition="1510" endWordPosition="1513">#hashtags, and urls. For chunks, we use a named entity tagger3 by Ritter et al. (2012). Predicted tags are used as features as follows: • POS tag: a conjunction feature with the current word and the current POS tag w0|p0. • Chunk tag: a unigram feature for chunk tag c0 and a conjunction feature with the current word and the current chunk tag w0|c0. 3https://github.com/aritter/twitter nlp 4.3 Brown clustering Brown clustering is a hierarchical clustering method that groups words into a binary tree of classes (Brown et al., 1992). We downloaded a brown clustering4 based on Wikipedia provided by Turian et al. (2010). We used whole bit string of the current word. 4.4 Word representation As a new source, tweet data are not applicable to the traditional model because of the different text structure. For a new model, it is natural to use annotated data. However, it is difficult to create new labeled data for a rapid generation of tweets. Instead of constantly annotate new data, the general solution is creating induced word representations from a large body of unlabeled data (Mikolov et al., 2013; Pennington et al., 2014; Kim et al., 2014; Anastasakos et al., 2014). A lot of previous work have used CCA becaus</context>
</contexts>
<marker>Turian, Ratinov, Bengio, 2010</marker>
<rawString>Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010. Word representations: a simple and general method for semi-supervised learning. In Proceedings of the 48th annual meeting of the association for computational linguistics, pages 384–394. Association for Computational Linguistics.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>