<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.000010">
<title confidence="0.998989">
Shared Tasks of the 2015 Workshop on Noisy User-generated Text:
Twitter Lexical Normalization and Named Entity Recognition
</title>
<author confidence="0.997965">
Timothy Baldwin
</author>
<affiliation confidence="0.998462">
University of Melbourne
</affiliation>
<email confidence="0.967503">
tb@ldwin.net
</email>
<author confidence="0.99057">
Young-Bum Kim
</author>
<affiliation confidence="0.996877">
University of Wisconsin
</affiliation>
<email confidence="0.997361">
ybkim@cs.wisc.edu
</email>
<author confidence="0.824847">
Marie-Catherine de Marneffe
</author>
<affiliation confidence="0.855859">
The Ohio State University
</affiliation>
<email confidence="0.90568">
demarneffe.1@osu.edu
</email>
<author confidence="0.988438">
Alan Ritter
</author>
<affiliation confidence="0.990655">
The Ohio State University
</affiliation>
<email confidence="0.995094">
ritter.1492@osu.edu
</email>
<author confidence="0.949217">
Bo Han
</author>
<affiliation confidence="0.839837">
IBM Research
</affiliation>
<email confidence="0.91117">
bohan.ibm@au1.ibm.com
</email>
<author confidence="0.996201">
Wei Xu
</author>
<affiliation confidence="0.998805">
University of Pennsylvania
</affiliation>
<email confidence="0.99767">
xwe@cis.upenn.edu
</email>
<sectionHeader confidence="0.993872" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999334111111111">
This paper presents the results of the
two shared tasks associated with W-NUT
2015: (1) a text normalization task with
10 participants; and (2) a named entity
tagging task with 8 participants. We
outline the task, annotation process and
dataset statistics, and provide a high-level
overview of the participating systems for
each shared task.
</bodyText>
<sectionHeader confidence="0.999263" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999380783783784">
As part of the 2015 ACL-IJCNLP Workshop on
Noisy User-generated Text (W-NUT), we orga-
nized two shared tasks: (1) a text normalization
task (Section 2); and (2) a named entity tagging
task (Section 3).
In the text normalization task, participants were
asked to convert non-standard words to their stan-
dard forms for English tweets. Participating sys-
tems were classified by their use of resources, into
a constrained and an unconstrained category: con-
strained systems were permitted to use only the
provided training data and off-the-shelf tools; un-
constrained systems, on the other hand, were free
to use any public tools and resources. There were
6 official submissions in the constrained category,
and 5 official submissions in the unconstrained
category. Overall, deep learning methods and
methods based on lexicon-augmented conditional
random fields (CRFs) achieved the best results.
The winning team achieved a precision of 0.9061
precision, recall of 0.7865, and F1 of 0.8421.
The named entity recognition task attracted 8
participants. The majority of teams built their sys-
tems using linear-chain conditional random fields
(Lafferty et al., 2001), and many teams also
used Brown clusters and word embedding fea-
tures (Turian et al., 2010). Notable new tech-
niques for named entity recognition in Twitter in-
clude a semi-Markov MIRA trained tagger (nrc),
an end-to-end neural network using no hand-
engineered features (multimedialab), an approach
that weights training data to compensate for con-
cept drift (USFD), and a differential evolution ap-
proach to feature selection (iitp). The submission
from the winning team (ousia) achieved surpris-
ingly good performance on this difficult task, near
the level of inter-rater agreement.
</bodyText>
<sectionHeader confidence="0.975445" genericHeader="method">
2 Text Normalization Shared Task
</sectionHeader>
<bodyText confidence="0.858777">
In this section, we outline the Twitter Text Nor-
malization Shared Task, describing the data and
annotation process, and outlining the approaches
adopted by participants.
</bodyText>
<subsectionHeader confidence="0.899763">
2.1 Background
</subsectionHeader>
<bodyText confidence="0.9953504">
Non-standard words are present in many text gen-
res, including advertisements, professional fo-
rums, and SMS messages. They can be the cause
of reading and understanding problems for hu-
mans, and degrade the accuracy of text process-
ing tools (Han et al., 2013; Plank et al., 2014a;
Kong et al., 2014). Text normalization aims to
transform non-standard words to their canonical
forms (Sproat et al., 2001; Han and Baldwin,
2011) as shown in Figure 1. Common examples
of non-standard words include abbreviations (e.g.,
u “you”), and non-standard spellings (e.g., cuming
“coming” or 2mr “tomorrow”). The prevalence of
non-standard words in social media text results in
markedly higher out-of-vocabulary (OOV) rates;
normalizing the text brings OOV rates down to
more conventional levels and makes the text more
amenable to automatic processing with off-the-
shelf tools which have been trained on edited text.
Text normalization over Twitter data has been
addressed at different granularities. For instance,
non-standard words can be considered as spelling
errors at the character (Liu et al., 2011) or
word level (Wang and Ng, 2013). Text nor-
malization can also be approached as a machine
</bodyText>
<page confidence="0.990969">
126
</page>
<note confidence="0.9952445">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 126–135,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<figureCaption confidence="0.999847">
Figure 1: Normalization examples
</figureCaption>
<bodyText confidence="0.995167631578948">
translation task, whereby non-standard words are
mapped to more canonical expressions (Aw et
al., 2006). Other approaches have involved deep
learning (Chrupała, 2014), cognitively-inspired
approaches (Liu et al., 2012), random walks (Has-
san and Menezes, 2013), and supervision us-
ing automatically-mined parallel data (Ling et al.,
2013).
One major challenge in text normalization re-
search has been the lack of annotated data for
training and evaluating methods. As a result, most
Twitter text normalization methods have been un-
supervised or semi-supervised (Cook and Steven-
son, 2009; Han et al., 2012; Yang and Eisen-
stein, 2013), and evaluated over small-scale hand-
annotated datasets. This has hampered analysis of
the strengths and weaknesses of individual meth-
ods, and was our motivation in organizing the lex-
ical normalization shared task.
</bodyText>
<subsectionHeader confidence="0.998295">
2.2 Shared Task Design
</subsectionHeader>
<bodyText confidence="0.999814263157895">
This lexical normalization shared task is focused
exclusively on English, and was designed with
three primary desiderata in mind: (1) to construct a
much larger dataset than existing resources; (2) to
allow all of 1:1, 1:N and N:1 word map-
pings; and (3) to cover not just OOV non-standard
words but also non-standard words that happen to
coincide in spelling with standard words. In all
three regards, the shared task expands upon the
scope of the de facto evaluation datasets of Han
and Baldwin (2011) and Liu et al. (2011).
One constraint that was placed on candidate to-
kens for normalization was that they should be
all-alphanumeric. For normalization, we adopted
American spelling.
In order to establish a more level playing field
for participants, but also encourage the use of
a wide range of resources, participants were re-
quired to nominate their system categories:
</bodyText>
<listItem confidence="0.770187545454545">
• Constrained: participants could not use any
data other than the provided training data to
perform the text normalization task. They
were allowed to use pre-trained tools (e.g.,
Twitter POS taggers), but no normalization
lexicons or extra tweet data.
• Unconstrained: participants could use any
publicly accessible data or tools to perform
the text normalization task.
Evaluation was based on token-level precision,
recall and F-score.
</listItem>
<subsectionHeader confidence="0.909476">
2.2.1 Preprocessing
</subsectionHeader>
<bodyText confidence="0.999982631578947">
We first collected tweets using the Twitter Stream-
ing API over the period 23–29 May, 2014, and
then used langid.py (Lui and Baldwin, 2012)1
to remove all non-English tweets. Tokenization
was performed with CMU-ARK tokeniser.2
To ensure that tweets had a high likelihood of
requiring lexical normalization, we filtered out
tweets with less than 2 non-standard words (i.e.
words not occurring in our dictionary — see Sec-
tion 2.2.3). While this biases the sample of tweets,
the decision was made at a pragmatic level to en-
sure a reasonable level of lexical normalization
and “annotation density”. This was based on a pi-
lot study over a random sample of English tweets,
in which we found that many non-standard words
were actually unknown named entities which did
not require normalization. In all, 5,200 randomly-
sampled English tweets were annotated for the
shared task dataset.
</bodyText>
<subsectionHeader confidence="0.834938">
2.2.2 Annotation
</subsectionHeader>
<bodyText confidence="0.724365">
12 interns and employees at IBM Research Aus-
tralia were involved in the data annotation. All
</bodyText>
<footnote confidence="0.993552333333333">
1https://github.com/saffsd/langid.py
2https://github.com/myleott/
ark-twokenize-py
</footnote>
<page confidence="0.996893">
127
</page>
<bodyText confidence="0.999964789473684">
annotators had a high level of English proficiency
(IELTS &gt; 6.0) and were reasonably familiar with
Twitter data. Each annotator labeled at least 200
tweets, and each tweet was independently labeled
by two annotators based on the annotation guide-
lines.3 As part of this, any non-English tweets
misclassified by langid.py were manually re-
moved from the dataset. This resulted in the fi-
nal size of the annotated dataset dropping to 4,917
tweets. All annotations were completed within
two weeks, and achieved an average Cohen's kappa of
0.5854.
For all instances of annotator disagreement, an
annotator who was not involved in the first-pass
annotation process was asked to adjudicate in the
following week. During the course of the shared
task, we additionally examined and incorporated
a small number of annotation corrections reported
by participants.
</bodyText>
<subsectionHeader confidence="0.488569">
2.2.3 English Lexicon
</subsectionHeader>
<bodyText confidence="0.999904590909091">
It is impossible to reach consensus on the di-
viding line between standard words and non-
standard words (e.g. are footie, y’all and youse
non-standard or standard words?). We artificially
arrive at such a dividing line via membership in
a prescribed lexicon of English. Specifically, we
use the SCOWL database with American spellings
as the default English lexicon.4 The SCOWL
database integrates words from multiple sources
and also contains valid word spelling variations,
which makes it an excellent English lexicon for
this shared task. As suggested in the database
guidelines, we used a dictionary size of 70%, such
that the lexicon contains words found in most dic-
tionaries, but also many high-frequency proper
nouns such as Obama and Facebook.
The overall English lexicon (after de-
duplication) contains 165,458 words. This
lexicon was used: (a) to pre-filter data, i.e., tweets
with less than two tokens not in this lexicon are
dropped from our annotations; and (b) as the basis
of the standard words for normalization.
</bodyText>
<subsectionHeader confidence="0.589697">
2.2.4 Dataset Statistics
</subsectionHeader>
<bodyText confidence="0.99991975">
The dataset was randomly split 60:40, into 2,950
tweets for the training data and 1,967 tweets for
the test data. Table 1 details the number of (possi-
bly multi-word) tokens in each of the training and
</bodyText>
<footnote confidence="0.997575333333333">
3http://noisy-text.github.io/files/
annotation_guideline_v1.1.pdf
4Version 2014.11.17 was used.
</footnote>
<table confidence="0.99910875">
Category 1:1 1:N N:1 Overall
Training 2,875 1,043 10 3,928
Test 2,024 704 10 2,738
Training ratio 0.587 0.597 0.500 0.589
</table>
<tableCaption confidence="0.994886">
Table 1: Numbers of non-standard words in the
</tableCaption>
<bodyText confidence="0.869659">
training and test datasets for the lexical normal-
ization task, broken down into 1:1, 1:N and N:1
mappings from non-standard words to standard
words. “Training ratio” represents the number of
non-standard words in the training data divided by
the overall non-standard words in that category.
</bodyText>
<figure confidence="0.971828818181818">
Rank Training Test
1 u 333 u 236
2 lol 272 lol 197
3 im 182 im 154
4 dont 92 nigga 60
5 omg 67 dont 57
6 nigga 57 lmao 45
7 niggas 52 n 43
8 lmao 51 niggas 42
9 n 49 omg 34
10 ur 46 ur 28
</figure>
<tableCaption confidence="0.515462">
Table 2: Top-10 most frequent non-standard words
</tableCaption>
<bodyText confidence="0.97363945">
in each partition of the lexical normalization
dataset.
test data that were normalized based on a 1:1, 1:N
or N:1 mapping. We additionally include the pro-
portion of tokens in each category that were con-
tained in the test data, to confirm that the dataset
is relatively balanced in composition between the
training and test partitions.
Overall, 373 non-standard word types were
found in the intersection of the training and test
data. The number of non-standard word types
unique to the training and test partitions was 777
and 488, respectively. We further show the top-
10 most frequent non-standard words and their to-
ken frequencies in the training, test and combined
datasets in Table 2. Despite the large number of
unique non-standard words in the training and test
partitions, there is relatively strong agreement in
the high-frequency non-standard words across the
dataset partitions.
</bodyText>
<equation confidence="0.978999454545454">
Combined
u 569
lol 469
im 336
dont 149
nigga 117
omg 101
lmao 96
niggas 94
n 92
ur 74
</equation>
<page confidence="0.982739">
128
</page>
<subsectionHeader confidence="0.6911">
2.3 Normalization Approaches and
</subsectionHeader>
<sectionHeader confidence="0.740964" genericHeader="method">
Discussion
</sectionHeader>
<bodyText confidence="0.980204950617284">
Overall, 10 teams submitted official runs to the
shared task: 6 teams participated in the con-
strained category, 5 teams in the unconstrained
category, and 1 team in both categories.5 The
normalization results for each category are shown
in Tables 3 and 4. Overall, common approaches
were lexicon-based methods, CRFs, and neu-
ral network-based approaches. Among the con-
strained systems, neural networks achieved strong
results, even without off-the-shelf tools. In con-
trast, CRF- and lexicon-based approaches were
shown to be effective in the unconstrained cat-
egory. Surprisingly, the best overall result was
achieved by a constrained system, suggesting
that the relative advantage in accessing additional
datasets or resources has less impact than the qual-
ity of the underlying model that is used to model
the task.
NCSU SAS NING (Jin, 2015) Normalization
candidates were generated based on the training
data, and scored based on Jaccard index over
character n-grams. Candidates were evaluated
using random forest classifiers to offset parameter
sensitivity, using features including normalization
statistics, string similarity and POS.
NCSU SAS WOOKHEE (Min et al., 2015)
Word-level edits are predicted based on long-short
term memory (LSTM) recurrent neural networks
(RNN), using character sequences and POS tags
as features. The LSTM is further complemented
with a normalization lexicon induced from the
training data.
NCSU SAS SAM (Leeman-Munk et al., 2015)
Two feed-forward neural networks are used to pre-
dict: (1) the normalized token given an input to-
ken; and (2) whether a word should be normalized
or left intact. Normalized tokens are further edited
by a “conformer” which down-weights rare words
as normalization candidates.
IITP (Akhtar et al., 2015b) A CRF model is
trained over the training data, with features in-
cluding word sequences, POS tags and morphol-
ogy features. Post-processing heuristics are used
to post-edit the output of the CRF.
5One team (GIGO) didn’t submit a description paper.
DCU-ADAPT (Wagner and Foster, 2015) A
generalized perceptron method is used to generate
word edit operations, with features including char-
acter n-grams, character classes, and RNN lan-
guage model hidden layer activation features. The
final normalization word is selected based on the
noisy channel model with a character language
model.
IHS RD (Supranovich and Patsepnia, 2015)
Non-standard words are identified using a CRF
tagger, using features such as token-level features,
contextual tokens, dictionary lookup, and edit dis-
tance. Multiple lexicons are combined to gener-
ate normalization candidates. A query misspelling
correction module (i.e., DidYouMean) is used to
post-process the output.
USZEGED (Berend and Tasnádi, 2015) A CRF
model is used to identify tokens requiring normal-
ization, and determine the type of normalization
required. Normalization candidates are then pro-
posed based on revised edit distance. The final
normalization candidate is selected on the basis of
n-gram statistics.
BEKLI (Beckley, 2015) A substitution dictio-
nary is constructed in which keys are non-standard
words and values are lists of potential normaliza-
tions. Frequent morphology errors are captured by
hand-crafted rules. Finally, the Viterbi algorithm
is applied to bigram sequences to decode the nor-
malized sentence with maximum probability.
LYSGROUP (Mosquera et al., 2015) A system
originally developed for Spanish text normaliza-
tion was adapted to English text normalization.
The method consists of a cascaded pipeline of sev-
eral data adaptors and processors, such as a Twitter
POS tagger and a spell checker.
</bodyText>
<sectionHeader confidence="0.9811435" genericHeader="method">
3 Named Entity Recognition over
Twitter
</sectionHeader>
<bodyText confidence="0.999465444444444">
The second shared task of W-NUT 2015 is named
entity recognition over Twitter data. Named en-
tity recognition is a crucial component in many
information extraction pipelines, however the ma-
jority of available NER tools were developed for
newswire text and perform poorly on informal text
genres such as Twitter. While performance on
named entity recognition in newswire is quite high
(Tjong Kim Sang and De Meulder, 2003), state-
</bodyText>
<page confidence="0.984164">
129
</page>
<table confidence="0.999862714285714">
Team name Precision Recall F1 Method highlights
NCSU SAS NING 0.9061 0.7865 0.8421 Random Forest
NCSU SAS WOOKHEE 0.9136 0.7398 0.8175 Lexicon + LSTM
NCSU SAS SAM 0.9012 0.7437 0.8149 ANN
IITP 0.9026 0.7191 0.8005 CRF + Rule
DCU-ADAPT 0.8190 0.5509 0.6587 Generalized Perceptron
LYSGROUP 0.4646 0.6281 0.5341 Spanish Normalization Adaption
</table>
<tableCaption confidence="0.996462">
Table 3: Results of the constrained systems for the lexical normalization shared task
</tableCaption>
<table confidence="0.999854166666667">
Team name Precision Recall F1 Method highlights
IHS RD 0.8469 0.8083 0.8272 Lexicon + CRF + DidYouMean
USZEGED 0.8606 0.7564 0.8052 CRF + n-grams
BEKLI 0.7743 0.7416 0.7571 Lexicon + Rule + Ranker
GIGO 0.7593 0.6963 0.7264 N/A
LYSGROUP 0.4592 0.6296 0.5310 Spanish Normalization Adaption
</table>
<tableCaption confidence="0.999871">
Table 4: Results of the unconstrained systems for the lexical normalization shared task
</tableCaption>
<bodyText confidence="0.996398548387097">
of-the-art performance on Twitter data lags far be-
hind.
The diverse and noisy style of user-generated
content presents serious challenges. For instance,
tweets, unlike edited newswire text, contain nu-
merous nonstandard spellings, abbreviations, un-
reliable capitalization, etc.
Another challenge is concept drift (Dredze et
al., 2010; Fromreide et al., 2014); the distribu-
tion of language and topics on Twitter is constantly
shifting leading to degraded performance of NLP
tools over time. To evaluate the effect of drift in
a realistic scenario, the current evaluation uses a
test set from a separate time period, which was not
announced to participants until the (unannotated)
test data was released at the beginning of the eval-
uation period.
To address these challenges, there has been an
increasing body of work on adapting named entity
recognition tools to noisy social media text (Der-
czynski et al., 2015b; Plank et al., 2014a; Cherry
and Guo, 2015; Ritter et al., 2011; Plank et al.,
2014b), however different research groups have
made use of different evaluation setups (e.g. train-
ing / test splits) making it challenging to perform
direct comparisons across systems. By organiz-
ing a shared evaluation we hope to help establish a
common evaluation methodology (for at least one
dataset) and also promote research and develop-
ment of NLP tools for user-generated social media
text genres.
</bodyText>
<subsectionHeader confidence="0.992835">
3.1 Training and Development Data
</subsectionHeader>
<bodyText confidence="0.9999885">
The training and development data for our task
was taken from previous work on Twitter NER
(Ritter et al., 2011), which distinguishes 10 dif-
ferent named entity types (see Table 5 for the set
of types). The data was split into 1,795 annotated
tweets for training (train) and 599 as a devel-
opment set (dev). Participants were allowed to
use the development data for training purposes in
their final submissions. This data was gathered in
September 2010 and annotated by the 5th author.
</bodyText>
<subsectionHeader confidence="0.999904">
3.2 Test Data Annotation
</subsectionHeader>
<bodyText confidence="0.999989928571429">
The test data was randomly sampled from Decem-
ber 2014 through February 2015. Two native En-
glish speakers were recruited to independently an-
notate the test data. The annotators were presented
with a set of simple guidelines6 that cover com-
mon ambiguous cases and also instructed to re-
fer to the September 2010 data for reference. The
BRAT tool7 was used for annotation. A screenshot
of the interface presented to annotators is shown
in Figure 2. During an initial training period,
both annotators independently labeled a set of 200
tweets after which disagreements were discussed
and resolved before moving on to annotate the fi-
nal test set. This initial annotation was only done
</bodyText>
<footnote confidence="0.9996675">
6http://bit.ly/1FSP6i2
7http://brat.nlplab.org/
</footnote>
<page confidence="0.997474">
130
</page>
<bodyText confidence="0.999940090909091">
for the purpose of training the annotators and the
resulting data was discarded.
The annotators then went on to double-annotate
a set of 1,425 messages. An adjudicator, the an-
notator of the training and dev sets, went through
each message and resolved disagreements. The
dataset was randomly split into 425 messages as
an additional development set (dev2015) which
was released to participants at the beginning of the
evaluation period. The remaining 1,000 messages
(test) were used for the final evaluation; annota-
tions on the test data were withheld from partici-
pants until the end of the evaluation period.
Table 5 presents precision and recall for each of
the 10 categories treating one annotator’s labels as
gold and the other’s as predicted. This exposes the
challenging nature of this annotation task and can
be viewed as a kind of human upper bound on pos-
sible system performance, though we believe the
consistency of the final annotations to be some-
what higher due to the second pass made by the
adjudicator. The value of Cohen’s kappa as measured
on word-level annotations is 0.607.
A baseline system was provided to participants
which takes a simple approach based on CRF-
suite8 using a standard set of features which in-
clude contextual, orthographic and gazetteers gen-
erated from Freebase (Bollacker et al., 2008). The
evaluation consisted of 2 sub-tasks: one in which
participants’ systems were required to segment
and classify 10 named entity types and one where
the task is only to predict entity segmentation (no
types).
</bodyText>
<subsectionHeader confidence="0.991327">
3.3 Approaches
</subsectionHeader>
<bodyText confidence="0.999868785714286">
Eight teams (Table 6) participated in the named
entity recognition shared task. A wide variety of
approaches were taken to tackle this task. Table 7
summarizes the features used by each team and
the machine learning approach taken. Many teams
made use of word embeddings and Brown clus-
ters as features. One team (multimedialab) used
absolutely no hand-engineered features, relying
entirely on word embeddings and a feed-forward
neural-network (FFNN) architecture (Godin et al.,
2015). Other new approaches to Twitter NER in-
clude a semi-Markov MIRA trained tagger devel-
oped by the NRC team (Cherry and Guo, 2015)
and the use of entity-linking based features by ou-
</bodyText>
<footnote confidence="0.987342">
8http://www.chokkan.org/software/
crfsuite/
</footnote>
<table confidence="0.999671">
Precision Recall F1
company 41.46 33.33 36.96
facility 50.00 66.67 57.14
geo-loc 63.57 70.09 66.67
movie 35.71 35.71 35.71
musicartist 60.98 47.17 53.19
other 48.21 50.00 49.09
person 60.42 80.56 69.05
product 44.83 19.12 26.80
sportsteam 75.00 71.74 73.33
tvshow 55.56 50.00 52.63
Overall 56.64 57.52 57.07
</table>
<tableCaption confidence="0.987937">
Table 5: Precision and recall comparing one an-
</tableCaption>
<bodyText confidence="0.98544475">
notator against the other. Cohen’s kappa between
the annotators was 0.607. Disagreements between
the annotators were resolved by a 3rd adjudicator for the
final datasets.
</bodyText>
<table confidence="0.775558222222222">
Team ID Affiliation
Hallym Hallym University
iitp Indian Institute of Technology Patna
lattice University Paris 3
multimedialab UGent - iMinds
NLANGP Institute for Infocomm Research
nrc National Research Council Canada
ousia Studio Ousia
USFD University of Sheffield
</table>
<tableCaption confidence="0.9096075">
Table 6: Team ID and affiliation of the named en-
tity recognition shared task participants.
</tableCaption>
<bodyText confidence="0.986657875">
sia (Yamada et al., 2015). All the other teams used
CRFs. On top of a CRF, the iitp team used a differ-
ential evolution based technique to obtain an opti-
mal feature set.
Most systems used the training data as well as
both dev sets provided to train their system, ex-
cept multimedialab which did not use dev2015
as training data and NRC which only used train.
</bodyText>
<page confidence="0.602115">
9
</page>
<bodyText confidence="0.99865875">
Tables 8 and 9 report the results obtained by
each team for segmentation and classification of
the 10 named entity types and for segmentation
only, respectively.
</bodyText>
<subsectionHeader confidence="0.958666">
3.4 System Descriptions
</subsectionHeader>
<bodyText confidence="0.988211">
Following is a brief description of the approach
taken by each team:
</bodyText>
<footnote confidence="0.994658333333333">
9A post-competition analysis of the effect of training on
development sets is presented in the NRC system description
paper (Cherry et al., 2015).
</footnote>
<page confidence="0.996182">
131
</page>
<figureCaption confidence="0.997728">
Figure 2: Annotation interface.
</figureCaption>
<table confidence="0.9993683">
POS Orthographic Gazetteers Brown clustering Word embedding ML
BASELINE – ✓ ✓ – – CRFsuite
Hallym ✓ – – ✓ correlation analysis CRFsuite
iitp ✓ ✓ ✓ – – CRF++
lattice ✓ ✓ – ✓ – CRF wapiti
multimedialab – – – – word2vec FFNN
NLANGP – ✓ ✓ ✓ word2vec &amp; GloVe CRF++
nrc – – ✓ ✓ word2vec semi-Markov MIRA
ousia ✓ ✓ ✓ – ✓ entity linking
USFD ✓ ✓ ✓ ✓ CRF L-BFGS
</table>
<tableCaption confidence="0.513418">
Table 7: Features and machine learning approach taken by each team.
</tableCaption>
<table confidence="0.9999915">
Precision Recall F1 Precision Recall F1
ousia 57.66 55.22 56.41 ousia 72.20 69.14 70.63
NLANGP 63.62 43.12 51.40 NLANGP 67.74 54.31 60.29
nrc 53.24 38.58 44.74 USFD 63.81 56.28 59.81
multimedialab 49.52 39.18 43.75 multimedialab 62.93 55.22 58.82
USFD 45.72 39.64 42.46 nrc 62.13 54.61 58.13
iitp 60.68 29.65 39.84 iitp 63.43 51.44 56.81
Hallym 39.59 35.10 37.21 Hallym 58.36 48.5 53.01
lattice 55.17 9.68 16.47 lattice 58.42 25.72 35.71
BASELINE 35.56 29.05 31.97 BASELINE 53.86 46.44 49.88
</table>
<tableCaption confidence="0.9854205">
Table 8: Results segmenting and categorizing en-
tities into 10 types.
</tableCaption>
<subsectionHeader confidence="0.694704">
Hallym (Yang and Kim, 2015) The Hallym
</subsectionHeader>
<bodyText confidence="0.8927814">
team used an approach based on CRFs using
both Brown clusters and word embeddings
trained using Canonical Correlation Analysis
as features.
iitp (Akhtar et al., 2015a) The iitp team pro-
</bodyText>
<tableCaption confidence="0.961378">
Table 9: Results on segmentation only (no types).
</tableCaption>
<bodyText confidence="0.997338285714286">
posed a multi-objective differential evolution
based technique for feature selection in twit-
ter named entity recognition.
lattice (Tian, 2015) Lattice employed a CRF
model using Wapiti. The feature templates
consisted of standard features used in state-
of-the-art. They trained first a model with
</bodyText>
<page confidence="0.995204">
132
</page>
<bodyText confidence="0.998463903225807">
dev 2015 and evaluated this model on train
and dev.
multimedialab (Godin et al., 2015) The goal of
the multimedia lab system was to only use
neural networks and word embeddings to
show the power of automatic feature learn-
ing and semi-supervised methods. A Feed-
Forward Neural Network was first trained,
that used only word2vec word embeddings
as input. Word embeddings were trained on
400 million unlabeled tweets. Leaky ReLUs
were used as activation function in combina-
tion with dropout to prevent overfitting. A
context window of 5 words was used as in-
put (2 words left and right). The output is a
single tag of the middle word. Afterwards, a
rule-based post-processing step was executed
to ensure every I-tag has a B-tag in front of
it and that all tags within a single span are of
the same type. Train and dev were used as
training data and used dev 2015 as validation
set.
NLANGP (Toh et al., 2015) The NLANGP team
modeled the problem as a sequential labeling
task and used Conditional Random Fields.
Several post-processing steps (e.g. rule-
based matching) were applied to refine the
system output. Besides Brown clusters, K-
means clusters were also used; the K-means
clusters were generated based on word em-
beddings.
nrc (Cherry et al., 2015) NRC applied a MIRA-
trained semi-Markov tagger with Gazetteer,
Brown cluster and Word Embedding fea-
tures. The Word Embeddings were built over
phrases using Word2Vec’s phrase finder tool,
and were modified using an auto-encoder to
be predictive of Gazetteer membership.
ousia (Yamada et al., 2015) The main character-
istics of the ousia method is enhancing the
performance of Twitter named entity recog-
nition using entity linking. Once entity men-
tions are disambiguated to the knowledge
base entries, high-quality knowledge can be
easily extracted from a knowledge base such
as the popularity of the entity, the classes of
the entity, and the likelihood that the entity
appears in the given context. They adopted
supervised machine-learning with features
including the results of NER and various in-
formation of the entity in knowledge bases.
Stanford NER was used for the NER
and in-house end-to-end entity linking soft-
ware was applied for entity linking.
USFD (Derczynski et al., 2015a) Feature extrac-
tion was based on large Brown clusters,
gazetteers tuned to the input data, and distant
supervision from Freebase. The representa-
tion was tuned for drift by down-weighting
temporally distant training examples. The
classifier was a linear chain CRF with hyper-
parameters tuned for Twitter.
</bodyText>
<sectionHeader confidence="0.996703" genericHeader="conclusions">
4 Summary
</sectionHeader>
<bodyText confidence="0.9999514">
In this paper, we presented two shared tasks on
Twitter text processing: Lexical Normalization
and Named Entity Recognition. We detailed the
task setup and datasets used in the respective
shared tasks, and also outlined the approach taken
by the participating systems. Both shared tasks
were of a scale substantially larger than what had
previously been attempted in the literature, with
two primary benefits. First, we are able to draw
stronger conclusions about the true potential of
different approaches. Second, through analyzing
the results of the participating systems, we are able
to suggest potential research directions for both fu-
ture shared tasks and noisy text processing in gen-
eral.
</bodyText>
<sectionHeader confidence="0.997638" genericHeader="acknowledgments">
Acknowledgments
</sectionHeader>
<bodyText confidence="0.999992571428571">
We would like to thank Svitlana Volkova and Jun-
ming Xu for feedback on a previous draft of this
paper. We also thank Javier Angel and Gabriella
Talvy for annotating the test data for the named
entity recognition shared task, and IBM Research
Australia for the generous support in doing the an-
notation for the lexical normalization shared task.
</bodyText>
<sectionHeader confidence="0.998979" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.990453714285714">
Md Shad Akhtar, Utpal Kumar Sikdar, and Asif Ek-
bal. 2015a. Iitp: Multiobjective differential evolu-
tion based twitter named entity recognition. In pro-
ceedings of WNUT.
Md Shad Akhtar, Utpal Kumar Sikdar, and Asif Ekbal.
2015b. Iitp: Hybrid approach for text normalization
in twitter. In proceedings of WNUT, Beijing, China.
</reference>
<page confidence="0.995628">
133
</page>
<reference confidence="0.998728772727273">
AiTi Aw, Min Zhang, Juan Xiao, and Jian Su. 2006.
A phrase-based statistical model for SMS text nor-
malization. In Proceedings of COLING/ACL 2006,
pages 33–40, Sydney, Australia.
Russell Beckley. 2015. Bekli: a simple approach to
twitter text normalization. In proceedings of WNUT,
Beijing, China.
Gabor Berend and Ervin Tasnádi. 2015. Uszeged:
Correction type-sensitive normalization of english
tweets using efficiently indexed n-gram statistics. In
proceedings of WNUT, Beijing, China.
Kurt Bollacker, Colin Evans, Praveen Paritosh, Tim
Sturge, and Jamie Taylor. 2008. Freebase: a col-
laboratively created graph database for structuring
human knowledge. In Proceedings of the 2008 ACM
SIGMOD international conference on Management
of data, pages 1247–1250. ACM.
Colin Cherry and Hongyu Guo. 2015. The unreason-
able effectiveness of word representations for twitter
named entity recognition. NAACL.
Colin Cherry, Hongyu Guo, and Chengbi Dai. 2015.
Nrc: Infused phrase vectors and updated gazetteers
for named entity recognition in twitter. In proceed-
ings of WNUT.
Grzegorz Chrupała. 2014. Normalizing tweets with
edit scripts and recurrent neural embeddings. In
Proceedings of the 52nd Annual Meeting of the
Association for Computational Linguistics (ACL
2014), pages 680–686, Baltimore, USA, June.
Paul Cook and Suzanne Stevenson. 2009. An unsu-
pervised model for text message normalization. In
Proceedings of the Workshop on Computational Ap-
proaches to Linguistic Creativity (CALC ’09), pages
71–78, Boulder, USA.
Leon Derczynski, Isabelle Augenstein, and Kalina
Bontcheva. 2015a. Usfd: Twitter ner with drift
compensation and linked data. In proceedings of
WNUT.
Leon Derczynski, Diana Maynard, Giuseppe Rizzo,
Marieke van Erp, Genevieve Gorrell, Raphaël
Troncy, Johann Petrak, and Kalina Bontcheva.
2015b. Analysis of named entity recognition and
linking for tweets. Information Processing &amp; Man-
agement, 51(2):32–49.
Mark Dredze, Tim Oates, and Christine Piatko. 2010.
We’re not in kansas anymore: detecting domain
changes in streams. In Proceedings of the 2010 Con-
ference on Empirical Methods in Natural Language
Processing, pages 585–595. Association for Com-
putational Linguistics.
Hege Fromreide, Dirk Hovy, and Anders Søgaard.
2014. Crowdsourcing and annotating ner for twit-
ter #drift. European language resources distribution
agency.
Fréderic Godin, Baptist Vandersmissen, Wesley
De Neve, and Rik Van de Walle. 2015. multime-
dialab @ acl wnut ner shared task: Named entity
recognition for twitter microposts using distributed
word representations. In proceedings of WNUT.
Bo Han and Timothy Baldwin. 2011. Lexical normal-
isation of short text messages: Makn sens a #twit-
ter. In Proceedings of the 49th Annual Meeting
of the Association for Computational Linguistics:
Human Language Technologies (ACL HLT 2011),
pages 368–378, Portland, USA.
Bo Han, Paul Cook, and Timothy Baldwin. 2012. Au-
tomatically constructing a normalisation dictionary
for microblogs. In Proceedings of the Joint Con-
ference on Empirical Methods in Natural Language
Processing and Computational Natural Language
Learning 2012 (EMNLP-CoNLL 2012), pages 421–
432, Jeju Island, Korea, July.
Bo Han, Paul Cook, and Timothy Baldwin. 2013.
Lexical normalisation for social media text. ACM
Transactions on Intelligent Systems and Technology,
4(1):5:1–5:27.
Hany Hassan and Arul Menezes. 2013. Social text nor-
malization using contextual graph random walks. In
Proceedings of the 51st Annual Meeting of the Asso-
ciation for Computational Linguistics (ACL 2013),
pages 1577–1586, Sofia, Bulgaria, August.
Ning Jin. 2015. Ncsu-sas-ning: Candidate genera-
tion and feature engineering for supervised lexical
normalization. In proceedings of WNUT, Beijing,
China.
Lingpeng Kong, Nathan Schneider, Swabha
Swayamdipta, Archna Bhatia, Chris Dyer, and
Noah A. Smith. 2014. A dependency parser for
tweets. In Proceedings of the 2014 Conference on
Empirical Methods in Natural Language Processing
(EMNLP), pages 1001–1012.
John D. Lafferty, Andrew McCallum, and Fernando
C. N. Pereira. 2001. Conditional random fields:
Probabilistic models for segmenting and labeling se-
quence data. In Proceedings of the Eighteenth Inter-
national Conference on Machine Learning.
Samuel Leeman-Munk, James Lester, and James Cox.
2015. Ncsu sas sam: Deep encoding and recon-
struction for normalization of noisy text. In proceed-
ings of WNUT, Beijing, China.
Wang Ling, Chris Dyer, Alan W Black, and Isabel
Trancoso. 2013. Paraphrasing 4 microblog normal-
ization. In Proceedings of the 2013 Conference on
Empirical Methods in Natural Language Processing
(EMNLP 2013), pages 73–84, Seattle, USA, Octo-
ber.
Fei Liu, Fuliang Weng, Bingqing Wang, and Yang Liu.
2011. Insertion, deletion, or substitution? Nor-
malizing text messages without pre-categorization
nor supervision. In Proceedings of the 49th Annual
</reference>
<page confidence="0.986227">
134
</page>
<reference confidence="0.999739272727273">
Meeting of the Association for Computational Lin-
guistics: Human Language Technologies (ACL HLT
2011), pages 71–76, Portland, USA.
Fei Liu, Fuliang Weng, and Xiao Jiang. 2012. A
broad-coverage normalization system for social me-
dia language. In Proceedings of the 50th Annual
Meeting of the Association for Computational Lin-
guistics (ACL 2012), pages 1035–1044, Jeju Island,
Korea, July.
Marco Lui and Timothy Baldwin. 2012. langid.py: An
off-the-shelf language identification tool. In Pro-
ceedings of the 50th Annual Meeting of the Asso-
ciation for Computational Linguistics (ACL 2012)
Demo Session, pages 25–30, Jeju, Republic of Ko-
rea.
Wookhee Min, Bradford Mott, James Lester, and James
Cox. 2015. Ncsu sas wookhee: A deep contextual
long-short term memory model for text normaliza-
tion. In proceedings of WNUT, Beijing, China.
Yerai Doval Mosquera, Jesús Vilares, and Carlos
Gómez-Rodriguez. 2015. Lysgroup: Adapting a
spanish microtext normalization system to english.
In proceedings of WNUT, Beijing, China.
Barbara Plank, Dirk Hovy, Ryan McDonald, and An-
ders Søgaard. 2014a. Adapting taggers to twit-
ter with not-so-distant supervision. In Proceedings
of COLING 2014, the 25th International Confer-
ence on Computational Linguistics: Technical Pa-
pers, pages 1783–1792.
Barbara Plank, Dirk Hovy, and Anders Søgaard.
2014b. Learning part-of-speech taggers with inter-
annotator agreement loss. In Proceedings of EACL.
Alan Ritter, Sam Clark, Oren Etzioni, et al. 2011.
Named entity recognition in tweets: an experimental
study. In Proceedings of the Conference on Empiri-
cal Methods in Natural Language Processing, pages
1524–1534. Association for Computational Linguis-
tics.
Richard Sproat, Alan W. Black, Stanley Chen, Shankar
Kumar, Mari Ostendorf, and Christopher Richards.
2001. Normalization of non-standard words. Com-
puter Speech and Language, 15(3):287–333.
Dmitry Supranovich and Viachaslau Patsepnia. 2015.
Ihs rd: Lexical normalization for english tweets. In
proceedings of WNUT, Beijing, China.
Tian Tian. 2015. Data adaptation for named entity
recognition on tweets with features-rich crf. In pro-
ceedings of WNUT.
Erik F Tjong Kim Sang and Fien De Meulder.
2003. Introduction to the conll-2003 shared task:
Language-independent named entity recognition. In
Proceedings of the seventh conference on Natural
language learning at HLT-NAACL 2003-Volume 4,
pages 142–147. Association for Computational Lin-
guistics.
Zhiqiang Toh, Bin Chen, and Jian Su. 2015. Improv-
ing twitter named entity recognition using word rep-
resentations. In proceedings of WNUT.
Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010.
Word representations: a simple and general method
for semi-supervised learning. In Proceedings of the
48th annual meeting of the association for compu-
tational linguistics, pages 384–394. Association for
Computational Linguistics.
Joachim Wagner and Jennifer Foster. 2015. Dcu-
adapt: Learning edit operations for microblog nor-
malisation with the generalised perceptron. In pro-
ceedings of WNUT, Beijing, China.
Pidong Wang and Hwee Tou Ng. 2013. A beam-
search decoder for normalization of social media
text with application to machine translation. In
Proceedings of the 2013 Conference of the North
American Chapter of the Association for Computa-
tional Linguistics: Human Language Technologies
(NAACL HLT 2013), pages 471–481, Atlanta, USA,
June.
Ikuya Yamada, Hideaki Takeda, and Takefuji
Yoshiyasu. 2015. Enhancing named entity
recognition in twitter messages using entity linking.
In proceedings of WNUT.
Yi Yang and Jacob Eisenstein. 2013. A log-linear
model for unsupervised text normalization. In
Proceedings of the 2013 Conference on Empirical
Methods in Natural Language Processing (EMNLP
2013), pages 61–72, Seattle, USA, October.
Eun-Suk Yang and Yu-Seop Kim. 2015. Hallym:
Named entity recognition on twitter. In proceedings
of WNUT.
</reference>
<page confidence="0.998782">
135
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.470577">
<title confidence="0.975393">Shared Tasks of the 2015 Workshop on Noisy User-generated Twitter Lexical Normalization and Named Entity Recognition</title>
<author confidence="0.998477">Timothy</author>
<affiliation confidence="0.999652">University of Melbourne</affiliation>
<email confidence="0.961391">tb@ldwin.net</email>
<affiliation confidence="0.770729">Young-Bum University of Wisconsin</affiliation>
<email confidence="0.997683">ybkim@cs.wisc.edu</email>
<author confidence="0.997241">Marie Catherine de</author>
<affiliation confidence="0.997251">The Ohio State University</affiliation>
<email confidence="0.995756">demarneffe.1@osu.edu</email>
<author confidence="0.99998">Alan Ritter</author>
<affiliation confidence="0.99985">The Ohio State University</affiliation>
<email confidence="0.992633">ritter.1492@osu.edu</email>
<author confidence="0.999979">Bo Han</author>
<affiliation confidence="0.999868">IBM Research</affiliation>
<email confidence="0.994322">bohan.ibm@au1.ibm.com</email>
<author confidence="0.999668">Wei Xu</author>
<affiliation confidence="0.999912">University of Pennsylvania</affiliation>
<email confidence="0.99979">xwe@cis.upenn.edu</email>
<abstract confidence="0.9989527">This paper presents the results of the two shared tasks associated with W-NUT 2015: (1) a text normalization task with 10 participants; and (2) a named entity tagging task with 8 participants. We outline the task, annotation process and dataset statistics, and provide a high-level overview of the participating systems for each shared task.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="false">
<authors>
<author>Md Shad</author>
</authors>
<title>Akhtar, Utpal Kumar Sikdar, and Asif Ekbal. 2015a. Iitp: Multiobjective differential evolution based twitter named entity recognition.</title>
<booktitle>In proceedings of WNUT.</booktitle>
<marker>Shad, </marker>
<rawString>Md Shad Akhtar, Utpal Kumar Sikdar, and Asif Ekbal. 2015a. Iitp: Multiobjective differential evolution based twitter named entity recognition. In proceedings of WNUT.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Md Shad Akhtar</author>
<author>Utpal Kumar Sikdar</author>
<author>Asif Ekbal</author>
</authors>
<title>Iitp: Hybrid approach for text normalization in twitter.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT,</booktitle>
<location>Beijing, China.</location>
<contexts>
<context position="13129" citStr="Akhtar et al., 2015" startWordPosition="2047" endWordPosition="2050">SAS WOOKHEE (Min et al., 2015) Word-level edits are predicted based on long-short term memory (LSTM) recurrent neural networks (RNN), using character sequences and POS tags as features. The LSTM is further complemented with a normalization lexicon induced from the training data. NCSU SAS SAM (Leeman-Munk et al., 2015) Two forward feed neural networks are used to predict: (1) the normalized token given an input token; and (2) whether a word should be normalized or left intact. Normalized tokens are further edited by a “conformer” which down-weights rare words as normalization candidates. IITP (Akhtar et al., 2015b) A CRF model is trained over the training data, with features including word sequences, POS tags and morphology features. Post-processing heuristics are used to post-edit the output of the CRF. 5One team (GIGO) didn’t submit a description paper. DCU-ADAPT (Wagner and Foster, 2015) A generalized perceptron method is used generate word edit operations, with features including character n-gram[ s], character classes, and RNN language model hidden layer activation features. The final normalization word is selected based on the noisy channel model with a character language model. IHD RD (Supranov</context>
<context position="24046" citStr="Akhtar et al., 2015" startWordPosition="3792" endWordPosition="3795">4 38.58 44.74 USFD 63.81 56.28 59.81 multimedialab 49.52 39.18 43.75 multimedialab 62.93 55.22 58.82 USFD 45.72 39.64 42.46 nrc 62.13 54.61 58.13 iitp 60.68 29.65 39.84 iitp 63.43 51.44 56.81 Hallym 39.59 35.10 37.21 Hallym 58.36 48.5 53.01 lattice 55.17 9.68 16.47 lattice 58.42 25.72 35.71 BASELINE 35.56 29.05 31.97 BASELINE 53.86 46.44 49.88 Table 8: Results segmenting and categorizing entities into 10 types. Hallym (Yang and Kim, 2015) The Hallym team used an approach based on CRFs using both Brown clusters and word embeddings trained using Canonical Correlation Analysis as features. iitp (Akhtar et al., 2015a) The iitp team proTable 9: Results on segmentation only (no types). posed a multi-objective differential evolution based technique for feature selection in twitter named entity recognition. lattice (Tian, 2015) Lattice employed a CRF model using Wapiti. The feature templates consisted of standard features used in stateof-the-art. They trained first a model with 132 dev 2015 and evaluated this model on train and dev. multimedialab (Godin et al., 2015) The goal of the multimedia lab system was to only use neural networks and word embeddings to show the power of automatic feature learning and s</context>
</contexts>
<marker>Akhtar, Sikdar, Ekbal, 2015</marker>
<rawString>Md Shad Akhtar, Utpal Kumar Sikdar, and Asif Ekbal. 2015b. Iitp: Hybrid approach for text normalization in twitter. In proceedings of WNUT, Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>AiTi Aw</author>
<author>Min Zhang</author>
<author>Juan Xiao</author>
<author>Jian Su</author>
</authors>
<title>A phrase-based statistical model for SMS text normalization.</title>
<date>2006</date>
<booktitle>In Proceedings of COLING/ACL</booktitle>
<pages>33--40</pages>
<location>Sydney, Australia.</location>
<contexts>
<context position="4257" citStr="Aw et al., 2006" startWordPosition="635" endWordPosition="638">have been trained on edited text. Text normalization over Twitter data has been addressed at different granularities. For instance, non-standard words can be considered as spelling errors at the character (Liu et al., 2011) or word level (Wang and Ng, 2013). Text normalization can also be approached as a machine 126 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 126–135, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics Figure 1: Normalization examples translation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined parallel data (Ling et al., 2013). One major challenge in text normalization research has been the lack of annotated data for training and evaluating methods. As a result, most Twitter text normalization methods have been unsupervised or semi-supervised (Cook and Stevenson, 2009; Han et al., 2012; Yang and Eisenstein, 2013), and evaluated over small-scale handannotated datasets. This has hampered </context>
</contexts>
<marker>Aw, Zhang, Xiao, Su, 2006</marker>
<rawString>AiTi Aw, Min Zhang, Juan Xiao, and Jian Su. 2006. A phrase-based statistical model for SMS text normalization. In Proceedings of COLING/ACL 2006, pages 33–40, Sydney, Australia.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Russell Beckley</author>
</authors>
<title>Bekli: a simple approach to twitter text normalization.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT,</booktitle>
<location>Beijing, China.</location>
<contexts>
<context position="14399" citStr="Beckley, 2015" startWordPosition="2238" endWordPosition="2239">ed using a CRF tagger, using features such as token-level features, contextual tokens, dictionary lookup, and edit distance. Multiple lexicons are combined to generate normalization candidates. A query misspelling correction module (i.e., DidYouMean) is used to post-process the output. USZEGED (Berend and Tasn´adi, 2015) A CRF model is used to identify tokens requiring normalization, and determine the type of normalization required. Normalization candidates are then proposed based on revised edit distance. The final normalization candidate is selected on the basis of n-grams tatistics. BEKLI (Beckley, 2015) A substitution dictionary is constructed in which keys are non-standard words and values are lists of potential normalizations. Frequent morphology errors are captured by hand-crafted rules. Finally, the Viterbi algorithm is applied to bigram sequences to decode the normalized sentence with maximum probability. LYSGROUP (Mosquera et al., 2015) A system originally developed for Spanish text normalization was adapted to English text normalization. The method consists of a cascaded pipeline of several data adaptors and processors, such as a Twitter POS tagger and a spell checker. 3 Named Entity </context>
</contexts>
<marker>Beckley, 2015</marker>
<rawString>Russell Beckley. 2015. Bekli: a simple approach to twitter text normalization. In proceedings of WNUT, Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Gabor Berend</author>
<author>Ervin Tasnádi</author>
</authors>
<title>Uszeged: Correction type-sensitive normalization of english tweets using efficiently indexed n-gram statistics.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT,</booktitle>
<location>Beijing, China.</location>
<marker>Berend, Tasnádi, 2015</marker>
<rawString>Gabor Berend and Ervin Tasnádi. 2015. Uszeged: Correction type-sensitive normalization of english tweets using efficiently indexed n-gram statistics. In proceedings of WNUT, Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Kurt Bollacker</author>
<author>Colin Evans</author>
<author>Praveen Paritosh</author>
<author>Tim Sturge</author>
<author>Jamie Taylor</author>
</authors>
<title>Freebase: a collaboratively created graph database for structuring human knowledge.</title>
<date>2008</date>
<booktitle>In Proceedings of the 2008 ACM SIGMOD international conference on Management of data,</booktitle>
<pages>1247--1250</pages>
<publisher>ACM.</publisher>
<contexts>
<context position="20246" citStr="Bollacker et al., 2008" startWordPosition="3170" endWordPosition="3173">or’s labels as gold and the other’s as predicted. This exposes the challenging nature of this annotation task and can be viewed as a kind of human upper bound on possible system performance, though we believe the consistency of the final annotations to be somewhat higher due to the second pass made by the adjudicator. The value of Cohen’s r. as measured on word-level annotations is 0.607. A baseline system was provided to participants which takes a simple approach based on CRFsuite8 using a standard set of features which include contextual, orthographic and gazetteers generated from Freebase (Bollacker et al., 2008). The evaluation consisted of 2 sub-tasks: one in which participants’ systems were required to segment and classify 10 named entity types and one where the task is only to predict entity segmentation (no types). 3.3 Approaches Eight teams (Table 6) participated in the named entity recognition shared task. A wide variety of approaches were taken to tackle this task. Table 7 summarizes the features used by each team and the machine learning approach taken. Many teams made use of word embeddings and Brown clusters as features. One team (multimedialab) used absolutely no hand-engineered features, </context>
</contexts>
<marker>Bollacker, Evans, Paritosh, Sturge, Taylor, 2008</marker>
<rawString>Kurt Bollacker, Colin Evans, Praveen Paritosh, Tim Sturge, and Jamie Taylor. 2008. Freebase: a collaboratively created graph database for structuring human knowledge. In Proceedings of the 2008 ACM SIGMOD international conference on Management of data, pages 1247–1250. ACM.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Colin Cherry</author>
<author>Hongyu Guo</author>
</authors>
<title>The unreasonable effectiveness of word representations for twitter named entity recognition.</title>
<date>2015</date>
<publisher>NAACL.</publisher>
<contexts>
<context position="17210" citStr="Cherry and Guo, 2015" startWordPosition="2674" endWordPosition="2677"> Fromreide et al., 2014); the distribution of language and topics on Twitter is constantly shifting leading to degraded performance of NLP tools over time. To evaluate the effect of drift in a realistic scenario, the current evaluation uses a test set from a separate time period, which was not announced to participants until the (unannotated) test data was released at the beginning of the evaluation period. To address these challenges, there has been an increasing body of work on adapting named entity recognition tools to noisy social media text (Derczynski et al., 2015b; Plank et al., 2014a; Cherry and Guo, 2015; Ritter et al., 2011; Plank et al., 2014b), however different research groups have made use of different evaluation setups (e.g. training / test splits) making it challenging to perform direct comparisons across systems. By organizing a shared evaluation we hope to help establish a common evaluation methodology (for at least one dataset) and also promote research and development of NLP tools for user-generated social media text genres. 3.1 Training and Development Data The training and development data for our task was taken from previous work on Twitter NER (Ritter et al., 2011), which disti</context>
<context position="21084" citStr="Cherry and Guo, 2015" startWordPosition="3303" endWordPosition="3306">pproaches Eight teams (Table 6) participated in the named entity recognition shared task. A wide variety of approaches were taken to tackle this task. Table 7 summarizes the features used by each team and the machine learning approach taken. Many teams made use of word embeddings and Brown clusters as features. One team (multimedialab) used absolutely no hand-engineered features, relying entirely on word embeddings and a feed-forward neural-network (FFNN) architecture (Godin et al., 2015). Other new approaches to Twitter NER include a semi-Markov MIRA trained tagger developed by the NRC team (Cherry and Guo, 2015) and the use of entity-linking based features by ou8http://www.chokkan.org/software/ crfsuite/ Precision Recall F#_1 company 41.46 33.33 36.96 facility 50.00 66.67 57.14 geo-loc 63.57 70.09 66.67 movie 35.71 35.71 35.71 musicartist 60.98 47.17 53.19 other 48.21 50.00 49.09 person 60.42 80.56 69.05 product 44.83 19.12 26.80 sportsteam 75.00 71.74 73.33 tvshow 55.56 50.00 52.63 Overall 56.64 57.52 57.07 Table 5: Precision and recall comparing one annotator against the other. Cohen’s kappa between the annotators was 0.607. Disagreements between the annotators resolved by a 3rd adjudicator for the</context>
</contexts>
<marker>Cherry, Guo, 2015</marker>
<rawString>Colin Cherry and Hongyu Guo. 2015. The unreasonable effectiveness of word representations for twitter named entity recognition. NAACL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Colin Cherry</author>
<author>Hongyu Guo</author>
<author>Chengbi Dai</author>
</authors>
<title>Nrc: Infused phrase vectors and updated gazetteers for named entity recognition in twitter.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT.</booktitle>
<contexts>
<context position="22818" citStr="Cherry et al., 2015" startWordPosition="3577" endWordPosition="3580">btain an optimal feature set. Most systems used the training data as well as both dev sets provided to train their system, except multimedialab which did not use dev2015 as training data and NRC which only used train. 9 Tables 8 and 9 report the results obtained by each team for segmentation and classification of the 10 named entity types and for segmentation only, respectively. 3.4 System Descriptions Following is a brief description of the approach taken by each team: 9A post-competition analysis of the effect of training on development sets is presented in the NRC system description paper (Cherry et al., 2015). 131 Figure 2: Annotation interface. POS Orthographic Gazetteers Brown clustering Word embedding ML BASELINE – ✓ ✓ – – CRFsuite Hallym ✓ – – ✓ correlation analysis CRFsuite iitp ✓ ✓ ✓ – – CRF++ lattice ✓ ✓ – ✓ – CRF wapiti multimedialab – – – – word2vec FFNN NLANGP – ✓ ✓ ✓ word2vec &amp; GloVe CRF++ nrc – – ✓ ✓ word2vec semi-Markov MIRA ousia ✓ ✓ ✓ – ✓ entity linking USFD ✓ ✓ ✓ ✓ CRF L-BFGS Table 7: Features and machine learning approach taken by each team. Precision Recall F#=1 Precision Recall F#=1 ousia 57.66 55.22 56.41 ousia 72.20 69.14 70.63 NLANGP 63.62 43.12 51.40 NLANGP 67.74 54.31 60.29</context>
<context position="25662" citStr="Cherry et al., 2015" startWordPosition="4062" endWordPosition="4065">rd. Afterwards, a rule-based post-processing step was executed to ensure every I-tag has a B-tag in front of it and that all tags within a single span are of the same type. Train and dev were used as training data and used dev 2015 as validation set. NLANGP (Toh et al., 2015) The NLANGP team modeled the problem as a sequential labeling task and used Conditional Random Fields. Several post-processing steps (e.g. rulebased matching) were applied to refine the system output. Besides Brown clusters, Kmeans clusters were also used; the K-means clusters were generated based on word embeddings. nrc (Cherry et al., 2015) NRC applied a MIRAtrained semi-Markov tagger with Gazetteer, Brown cluster and Word Embedding features. The Word Embeddings were built over phrases using Word2Vec’s phrase finder tool, and were modified using an auto-encoder to be predictive of Gazetteer membership. ousia (Yamada et al., 2015) The main characteristics of the ousia method is enhancing the performance of Twitter named entity recognition using entity linking. Once entity mentions are disambiguated to the knowledge base entries, high-quality knowledge can be easily extracted from a knowledge base such as the popularity of the ent</context>
</contexts>
<marker>Cherry, Guo, Dai, 2015</marker>
<rawString>Colin Cherry, Hongyu Guo, and Chengbi Dai. 2015. Nrc: Infused phrase vectors and updated gazetteers for named entity recognition in twitter. In proceedings of WNUT.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Grzegorz Chrupała</author>
</authors>
<title>Normalizing tweets with edit scripts and recurrent neural embeddings.</title>
<date>2014</date>
<booktitle>In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (ACL 2014),</booktitle>
<pages>680--686</pages>
<location>Baltimore, USA,</location>
<contexts>
<context position="4320" citStr="Chrupała, 2014" startWordPosition="645" endWordPosition="646">r data has been addressed at different granularities. For instance, non-standard words can be considered as spelling errors at the character (Liu et al., 2011) or word level (Wang and Ng, 2013). Text normalization can also be approached as a machine 126 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 126–135, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics Figure 1: Normalization examples translation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined parallel data (Ling et al., 2013). One major challenge in text normalization research has been the lack of annotated data for training and evaluating methods. As a result, most Twitter text normalization methods have been unsupervised or semi-supervised (Cook and Stevenson, 2009; Han et al., 2012; Yang and Eisenstein, 2013), and evaluated over small-scale handannotated datasets. This has hampered analysis of the strengths and weaknesses of individual methods,</context>
</contexts>
<marker>Chrupała, 2014</marker>
<rawString>Grzegorz Chrupała. 2014. Normalizing tweets with edit scripts and recurrent neural embeddings. In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (ACL 2014), pages 680–686, Baltimore, USA, June.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Paul Cook</author>
<author>Suzanne Stevenson</author>
</authors>
<title>An unsupervised model for text message normalization.</title>
<date>2009</date>
<booktitle>In Proceedings of the Workshop on Computational Approaches to Linguistic Creativity (CALC ’09),</booktitle>
<pages>71--78</pages>
<location>Boulder, USA.</location>
<contexts>
<context position="4736" citStr="Cook and Stevenson, 2009" startWordPosition="705" endWordPosition="709">guistics Figure 1: Normalization examples translation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined parallel data (Ling et al., 2013). One major challenge in text normalization research has been the lack of annotated data for training and evaluating methods. As a result, most Twitter text normalization methods have been unsupervised or semi-supervised (Cook and Stevenson, 2009; Han et al., 2012; Yang and Eisenstein, 2013), and evaluated over small-scale handannotated datasets. This has hampered analysis of the strengths and weaknesses of individual methods, and was our motivation in organizing the lexical normalization shared task. 2.2 Shared Task Design This lexical normalization shared task is focused exclusively on English, and was designed with three primary desiderata in mind: (1) to construct a much larger dataset than existing resources; (2) to allow all of 1:1, 1:N and N:1 word n-gramm appings; and (3) to cover not just OOV non-standard words but also non-s</context>
</contexts>
<marker>Cook, Stevenson, 2009</marker>
<rawString>Paul Cook and Suzanne Stevenson. 2009. An unsupervised model for text message normalization. In Proceedings of the Workshop on Computational Approaches to Linguistic Creativity (CALC ’09), pages 71–78, Boulder, USA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Leon Derczynski</author>
<author>Isabelle Augenstein</author>
<author>Kalina Bontcheva</author>
</authors>
<title>Usfd: Twitter ner with drift compensation and linked data.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT.</booktitle>
<contexts>
<context position="17166" citStr="Derczynski et al., 2015" startWordPosition="2665" endWordPosition="2669">hallenge is concept drift (Dredze et al., 2010; Fromreide et al., 2014); the distribution of language and topics on Twitter is constantly shifting leading to degraded performance of NLP tools over time. To evaluate the effect of drift in a realistic scenario, the current evaluation uses a test set from a separate time period, which was not announced to participants until the (unannotated) test data was released at the beginning of the evaluation period. To address these challenges, there has been an increasing body of work on adapting named entity recognition tools to noisy social media text (Derczynski et al., 2015b; Plank et al., 2014a; Cherry and Guo, 2015; Ritter et al., 2011; Plank et al., 2014b), however different research groups have made use of different evaluation setups (e.g. training / test splits) making it challenging to perform direct comparisons across systems. By organizing a shared evaluation we hope to help establish a common evaluation methodology (for at least one dataset) and also promote research and development of NLP tools for user-generated social media text genres. 3.1 Training and Development Data The training and development data for our task was taken from previous work on Tw</context>
<context position="26651" citStr="Derczynski et al., 2015" startWordPosition="4219" endWordPosition="4222">nce of Twitter named entity recognition using entity linking. Once entity mentions are disambiguated to the knowledge base entries, high-quality knowledge can be easily extracted from a knowledge base such as the popularity of the entity, the classes of the entity, and the likelihood that the entity appears in the given context. They adopted supervised machine-learning with features including the results of NER and various information of the entity in knowledge bases. We use Stanford NER was used for the NER and in-house end-to-end entity linking software was applied for entity linking. USFD (Derczynski et al., 2015a) Feature extraction was based on large Brown clusters, gazetteers tuned to the input data, and distant supervision from Freebase. The representation was tuned for drift by down-weighting temporally distant training examples. The classifier was a linear chain CRF with hyperparameters tuned for Twitter. 4 Summary In this paper, we presented two shared tasks on Twitter text processing: Lexical Normalization and Named Entity Recognition. We detailed the task setup and datasets used in the respective shared tasks, and also outlined the approach taken by the participating systems. Both shared task</context>
</contexts>
<marker>Derczynski, Augenstein, Bontcheva, 2015</marker>
<rawString>Leon Derczynski, Isabelle Augenstein, and Kalina Bontcheva. 2015a. Usfd: Twitter ner with drift compensation and linked data. In proceedings of WNUT.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Leon Derczynski</author>
<author>Diana Maynard</author>
<author>Giuseppe Rizzo</author>
<author>Marieke van Erp</author>
<author>Genevieve Gorrell</author>
<author>Raphaël Troncy</author>
<author>Johann Petrak</author>
<author>Kalina Bontcheva</author>
</authors>
<title>Analysis of named entity recognition and linking for tweets.</title>
<date>2015</date>
<journal>Information Processing &amp; Management,</journal>
<volume>51</volume>
<issue>2</issue>
<marker>Derczynski, Maynard, Rizzo, van Erp, Gorrell, Troncy, Petrak, Bontcheva, 2015</marker>
<rawString>Leon Derczynski, Diana Maynard, Giuseppe Rizzo, Marieke van Erp, Genevieve Gorrell, Raphaël Troncy, Johann Petrak, and Kalina Bontcheva. 2015b. Analysis of named entity recognition and linking for tweets. Information Processing &amp; Management, 51(2):32–49.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Mark Dredze</author>
<author>Tim Oates</author>
<author>Christine Piatko</author>
</authors>
<title>We’re not in kansas anymore: detecting domain changes in streams.</title>
<date>2010</date>
<booktitle>In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>585--595</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<contexts>
<context position="16589" citStr="Dredze et al., 2010" startWordPosition="2570" endWordPosition="2573"> DidYouMean USZEGED 0.8606 0.7564 0.8052 CRF + n-gram[ s] BEKLI 0.7743 0.7416 0.7571 Lexicon + Rule + Ranker GIGO 0.7593 0.6963 0.7264 N/A LYSGROUP 0.4592 0.6296 0.5310 Spanish Normalization Adaption Table 4: Results of the unconstrained systems for the lexical normalization shared task of-the-art performance on Twitter data lags far behind. The diverse and noisy style of user-generated content presents serious challenges. For instance tweets, unlike edited newswire text, contain numerous nonstandard spellings, abbreviations, unreliable capitalization, etc. Another challenge is concept drift (Dredze et al., 2010; Fromreide et al., 2014); the distribution of language and topics on Twitter is constantly shifting leading to degraded performance of NLP tools over time. To evaluate the effect of drift in a realistic scenario, the current evaluation uses a test set from a separate time period, which was not announced to participants until the (unannotated) test data was released at the beginning of the evaluation period. To address these challenges, there has been an increasing body of work on adapting named entity recognition tools to noisy social media text (Derczynski et al., 2015b; Plank et al., 2014a;</context>
</contexts>
<marker>Dredze, Oates, Piatko, 2010</marker>
<rawString>Mark Dredze, Tim Oates, and Christine Piatko. 2010. We’re not in kansas anymore: detecting domain changes in streams. In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing, pages 585–595. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Hege Fromreide</author>
<author>Dirk Hovy</author>
<author>Anders Søgaard</author>
</authors>
<title>Crowdsourcing and annotating ner for twitter# drift.</title>
<publisher>European language resources distribution agency.</publisher>
<date>2014</date>
<contexts>
<context position="16614" citStr="Fromreide et al., 2014" startWordPosition="2574" endWordPosition="2577">.8606 0.7564 0.8052 CRF + n-gram[ s] BEKLI 0.7743 0.7416 0.7571 Lexicon + Rule + Ranker GIGO 0.7593 0.6963 0.7264 N/A LYSGROUP 0.4592 0.6296 0.5310 Spanish Normalization Adaption Table 4: Results of the unconstrained systems for the lexical normalization shared task of-the-art performance on Twitter data lags far behind. The diverse and noisy style of user-generated content presents serious challenges. For instance tweets, unlike edited newswire text, contain numerous nonstandard spellings, abbreviations, unreliable capitalization, etc. Another challenge is concept drift (Dredze et al., 2010; Fromreide et al., 2014); the distribution of language and topics on Twitter is constantly shifting leading to degraded performance of NLP tools over time. To evaluate the effect of drift in a realistic scenario, the current evaluation uses a test set from a separate time period, which was not announced to participants until the (unannotated) test data was released at the beginning of the evaluation period. To address these challenges, there has been an increasing body of work on adapting named entity recognition tools to noisy social media text (Derczynski et al., 2015b; Plank et al., 2014a; Cherry and Guo, 2015; Ri</context>
</contexts>
<marker>Fromreide, Hovy, Søgaard, 2014</marker>
<rawString>Hege Fromreide, Dirk Hovy, and Anders Søgaard. 2014. Crowdsourcing and annotating ner for twitter# drift. European language resources distribution agency.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Fréderic Godin</author>
<author>Baptist Vandersmissen</author>
<author>Wesley De Neve</author>
<author>Rik Van de Walle</author>
</authors>
<title>multimedialab @ acl wnut ner shared task: Named entity recognition for twitter microposts using distributed word representations.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT.</booktitle>
<marker>Godin, Vandersmissen, De Neve, Van de Walle, 2015</marker>
<rawString>Fréderic Godin, Baptist Vandersmissen, Wesley De Neve, and Rik Van de Walle. 2015. multimedialab @ acl wnut ner shared task: Named entity recognition for twitter microposts using distributed word representations. In proceedings of WNUT.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalisation of short text messages: Makn sens a #twitter.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL HLT 2011),</booktitle>
<pages>368--378</pages>
<location>Portland, USA.</location>
<contexts>
<context position="3197" citStr="Han and Baldwin, 2011" startWordPosition="477" endWordPosition="480"> Shared Task In this section, we outline the Twitter Text Normalization Shared Task, describing the data and annotation process, and outlining the approaches adopted by participants. 2.1 Background Non-standard words are present in many text genres, including advertisements, professional forums, and SMS messages. They can be the cause of reading and understanding problems for humans, and degrade the accuracy of text processing tools (Han et al., 2013; Plank et al., 2014a; Kong et al., 2014). Text normalization aims to transform non-standard words to their canonical forms (Sproat et al., 2001; Han and Baldwin, 2011) as shown in Figure 1. Common examples of non-standard words include abbreviations (e.g., u “you”), and non-standard spellings (e.g., cuming “coming” or 2mr “tomorrow”). The prevalence of non-standard words in social media text results in markedly higher out-of-vocabulary (OOV) rates; normalizing the text brings OOV rates down to more conventional levels and makes the text more amenable to automatic processing with off-theshelf tools which have been trained on edited text. Text normalization over Twitter data has been addressed at different granularities. For instance, non-standard words can b</context>
<context position="5529" citStr="Han and Baldwin (2011)" startWordPosition="837" endWordPosition="840">dual methods, and was our motivation in organizing the lexical normalization shared task. 2.2 Shared Task Design This lexical normalization shared task is focused exclusively on English, and was designed with three primary desiderata in mind: (1) to construct a much larger dataset than existing resources; (2) to allow all of 1:1, 1:N and N:1 word n-gramm appings; and (3) to cover not just OOV non-standard words but also non-standard words that happen to coincide in spelling with standard words. In all three regards, the shared task expands upon the scope of the de facto evaluation datasets of Han and Baldwin (2011) and Liu et al. (2011). One constraint that was placed on candidate tokens for normalization was that they should be all-alphanumeric. For normalization, we adopted American spelling. In order to establish a more level playing field for participants, but also encourage the use of a wide range of resources, participants were required to nominate their system categories: • Constrained: participants could not use any data other than the provided training data to perform the text normalization task. They were allowed to use pre-trained tools (e.g., Twitter POS taggers), but no normalization lexico</context>
</contexts>
<marker>Han, Baldwin, 2011</marker>
<rawString>Bo Han and Timothy Baldwin. 2011. Lexical normalisation of short text messages: Makn sens a #twitter. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL HLT 2011), pages 368–378, Portland, USA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Paul Cook</author>
<author>Timothy Baldwin</author>
</authors>
<title>Automatically constructing a normalisation dictionary for microblogs.</title>
<date>2012</date>
<booktitle>In Proceedings of the Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning 2012 (EMNLP-CoNLL 2012),</booktitle>
<pages>421--432</pages>
<location>Jeju Island, Korea,</location>
<contexts>
<context position="4754" citStr="Han et al., 2012" startWordPosition="710" endWordPosition="713">zation examples translation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined parallel data (Ling et al., 2013). One major challenge in text normalization research has been the lack of annotated data for training and evaluating methods. As a result, most Twitter text normalization methods have been unsupervised or semi-supervised (Cook and Stevenson, 2009; Han et al., 2012; Yang and Eisenstein, 2013), and evaluated over small-scale handannotated datasets. This has hampered analysis of the strengths and weaknesses of individual methods, and was our motivation in organizing the lexical normalization shared task. 2.2 Shared Task Design This lexical normalization shared task is focused exclusively on English, and was designed with three primary desiderata in mind: (1) to construct a much larger dataset than existing resources; (2) to allow all of 1:1, 1:N and N:1 word n-gramm appings; and (3) to cover not just OOV non-standard words but also non-standard words that</context>
</contexts>
<marker>Han, Cook, Baldwin, 2012</marker>
<rawString>Bo Han, Paul Cook, and Timothy Baldwin. 2012. Automatically constructing a normalisation dictionary for microblogs. In Proceedings of the Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning 2012 (EMNLP-CoNLL 2012), pages 421– 432, Jeju Island, Korea, July.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Paul Cook</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalisation for social media text.</title>
<date>2013</date>
<journal>ACM Transactions on Intelligent Systems and Technology,</journal>
<volume>4</volume>
<issue>1</issue>
<contexts>
<context position="3029" citStr="Han et al., 2013" startWordPosition="450" endWordPosition="453">bmission from the winning team (ousia) achieved suprisingly good performance on this difficult task, near the level of inter-rater agreement. 2 Text Normalization Shared Task In this section, we outline the Twitter Text Normalization Shared Task, describing the data and annotation process, and outlining the approaches adopted by participants. 2.1 Background Non-standard words are present in many text genres, including advertisements, professional forums, and SMS messages. They can be the cause of reading and understanding problems for humans, and degrade the accuracy of text processing tools (Han et al., 2013; Plank et al., 2014a; Kong et al., 2014). Text normalization aims to transform non-standard words to their canonical forms (Sproat et al., 2001; Han and Baldwin, 2011) as shown in Figure 1. Common examples of non-standard words include abbreviations (e.g., u “you”), and non-standard spellings (e.g., cuming “coming” or 2mr “tomorrow”). The prevalence of non-standard words in social media text results in markedly higher out-of-vocabulary (OOV) rates; normalizing the text brings OOV rates down to more conventional levels and makes the text more amenable to automatic processing with off-theshelf </context>
</contexts>
<marker>Han, Cook, Baldwin, 2013</marker>
<rawString>Bo Han, Paul Cook, and Timothy Baldwin. 2013. Lexical normalisation for social media text. ACM Transactions on Intelligent Systems and Technology, 4(1):5:1–5:27.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Hany Hassan</author>
<author>Arul Menezes</author>
</authors>
<title>Social text normalization using contextual graph random walks.</title>
<date>2013</date>
<booktitle>In Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (ACL 2013),</booktitle>
<pages>1577--1586</pages>
<location>Sofia, Bulgaria,</location>
<contexts>
<context position="4413" citStr="Hassan and Menezes, 2013" startWordPosition="655" endWordPosition="659">rds can be considered as spelling errors at the character (Liu et al., 2011) or word level (Wang and Ng, 2013). Text normalization can also be approached as a machine 126 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 126–135, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics Figure 1: Normalization examples translation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined parallel data (Ling et al., 2013). One major challenge in text normalization research has been the lack of annotated data for training and evaluating methods. As a result, most Twitter text normalization methods have been unsupervised or semi-supervised (Cook and Stevenson, 2009; Han et al., 2012; Yang and Eisenstein, 2013), and evaluated over small-scale handannotated datasets. This has hampered analysis of the strengths and weaknesses of individual methods, and was our motivation in organizing the lexical normalization shared task. 2.2 Shared Task </context>
</contexts>
<marker>Hassan, Menezes, 2013</marker>
<rawString>Hany Hassan and Arul Menezes. 2013. Social text normalization using contextual graph random walks. In Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (ACL 2013), pages 1577–1586, Sofia, Bulgaria, August.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Ning Jin</author>
</authors>
<title>Ncsu-sas-ning: Candidate generation and feature engineering for supervised lexical normalization.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT,</booktitle>
<location>Beijing, China.</location>
<contexts>
<context position="12206" citStr="Jin, 2015" startWordPosition="1910" endWordPosition="1911"> shown in Tables 3 and 4. Overall, common approaches were lexicon-based methods, CRFs, and neural network-based approaches. Among the constrained systems, neural networks achieved strong results, even without off-the-shelf tools. In contrast, CRF- and lexicon-based approaches were shown to be effective in the unconstrained category. Surprisingly, the best overall result was achieved by a constrained system, suggesting that the relative advantage in accessing additional datasets or resources has less impact than the quality of the underlying model that is used to model the task. NCSU SAS NING (Jin, 2015) Normalization candidates were generated based on the training data, and scored based on Jaccard index over character n-gram[ s]. Candidates were evaluated using random forest classifiers to offset parameter sensitivity, using features including normalization statistics, string similarity and POS. NCSU SAS WOOKHEE (Min et al., 2015) Word-level edits are predicted based on long-short term memory (LSTM) recurrent neural networks (RNN), using character sequences and POS tags as features. The LSTM is further complemented with a normalization lexicon induced from the training data. NCSU SAS SAM (Le</context>
</contexts>
<marker>Jin, 2015</marker>
<rawString>Ning Jin. 2015. Ncsu-sas-ning: Candidate generation and feature engineering for supervised lexical normalization. In proceedings of WNUT, Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Lingpeng Kong</author>
<author>Nathan Schneider</author>
<author>Swabha Swayamdipta</author>
<author>Archna Bhatia</author>
<author>Chris Dyer</author>
<author>A Noah Smith</author>
</authors>
<title>A dependency parser for tweets.</title>
<date>2014</date>
<booktitle>In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP),</booktitle>
<pages>1001--1012</pages>
<contexts>
<context position="3070" citStr="Kong et al., 2014" startWordPosition="458" endWordPosition="461">achieved suprisingly good performance on this difficult task, near the level of inter-rater agreement. 2 Text Normalization Shared Task In this section, we outline the Twitter Text Normalization Shared Task, describing the data and annotation process, and outlining the approaches adopted by participants. 2.1 Background Non-standard words are present in many text genres, including advertisements, professional forums, and SMS messages. They can be the cause of reading and understanding problems for humans, and degrade the accuracy of text processing tools (Han et al., 2013; Plank et al., 2014a; Kong et al., 2014). Text normalization aims to transform non-standard words to their canonical forms (Sproat et al., 2001; Han and Baldwin, 2011) as shown in Figure 1. Common examples of non-standard words include abbreviations (e.g., u “you”), and non-standard spellings (e.g., cuming “coming” or 2mr “tomorrow”). The prevalence of non-standard words in social media text results in markedly higher out-of-vocabulary (OOV) rates; normalizing the text brings OOV rates down to more conventional levels and makes the text more amenable to automatic processing with off-theshelf tools which have been trained on edited t</context>
</contexts>
<marker>Kong, Schneider, Swayamdipta, Bhatia, Dyer, Smith, 2014</marker>
<rawString>Lingpeng Kong, Nathan Schneider, Swabha Swayamdipta, Archna Bhatia, Chris Dyer, and A. Noah Smith. 2014. A dependency parser for tweets. In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1001–1012.</rawString>
</citation>
<citation valid="true">
<authors>
<author>John D Lafferty</author>
<author>Andrew McCallum</author>
<author>Fernando C N Pereira</author>
</authors>
<title>Conditional random fields: Probabilistic models for segmenting and labeling sequence data.</title>
<date>2001</date>
<booktitle>In Proceedings of the Eighteenth International Conference on Machine Learning.</booktitle>
<contexts>
<context position="1977" citStr="Lafferty et al., 2001" startWordPosition="286" endWordPosition="289">e-shelf tools; unconstrained systems, on the other hand, were free to use any public tools and resources. There were 6 official submissions in the constrained category, and 5 official submissions in the unconstrained category. Overall, deep learning methods and methods based on lexicon-augmented conditional random fields (CRFs) achieved the best results. The winning team achieved a precision of 0.9061 precision, recall of 0.7865, and F1 of 0.8421. The named entity recognition task attracted 8 participants. The majority of teams built their systems using linear-chain conditional random fields (Lafferty et al., 2001), and many teams also used brown clusters and word embedding features (Turian et al., 2010). Notable new techniques for named entity recognition in Twitter include a semi-Markov MIRA trained tagger (nrc), an end-to-end neural network using no handengineered features (multimedialab), an approach that weights training data to compensate for concept drift (USFD), and a differential evolution approach to feature selection (iitp). The submission from the winning team (ousia) achieved suprisingly good performance on this difficult task, near the level of inter-rater agreement. 2 Text Normalization S</context>
</contexts>
<marker>Lafferty, McCallum, Pereira, 2001</marker>
<rawString>John D. Lafferty, Andrew McCallum, and Fernando C. N. Pereira. 2001. Conditional random fields: Probabilistic models for segmenting and labeling sequence data. In Proceedings of the Eighteenth International Conference on Machine Learning.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Samuel Leeman-Munk</author>
<author>James Lester</author>
<author>James Cox</author>
</authors>
<title>Ncsu sas sam: Deep encoding and reconstruction for normalization of noisy text.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT,</booktitle>
<location>Beijing, China.</location>
<contexts>
<context position="12829" citStr="Leeman-Munk et al., 2015" startWordPosition="1997" endWordPosition="2000">5) Normalization candidates were generated based on the training data, and scored based on Jaccard index over character n-gram[ s]. Candidates were evaluated using random forest classifiers to offset parameter sensitivity, using features including normalization statistics, string similarity and POS. NCSU SAS WOOKHEE (Min et al., 2015) Word-level edits are predicted based on long-short term memory (LSTM) recurrent neural networks (RNN), using character sequences and POS tags as features. The LSTM is further complemented with a normalization lexicon induced from the training data. NCSU SAS SAM (Leeman-Munk et al., 2015) Two forward feed neural networks are used to predict: (1) the normalized token given an input token; and (2) whether a word should be normalized or left intact. Normalized tokens are further edited by a “conformer” which down-weights rare words as normalization candidates. IITP (Akhtar et al., 2015b) A CRF model is trained over the training data, with features including word sequences, POS tags and morphology features. Post-processing heuristics are used to post-edit the output of the CRF. 5One team (GIGO) didn’t submit a description paper. DCU-ADAPT (Wagner and Foster, 2015) A generalized pe</context>
</contexts>
<marker>Leeman-Munk, Lester, Cox, 2015</marker>
<rawString>Samuel Leeman-Munk, James Lester, and James Cox. 2015. Ncsu sas sam: Deep encoding and reconstruction for normalization of noisy text. In proceedings of WNUT, Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Wang Ling</author>
<author>Chris Dyer</author>
<author>Alan W Black</author>
<author>Isabel Trancoso</author>
</authors>
<title>Paraphrasing 4 microblog normalization.</title>
<date>2013</date>
<booktitle>In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing (EMNLP 2013),</booktitle>
<pages>73--84</pages>
<location>Seattle, USA,</location>
<contexts>
<context position="4490" citStr="Ling et al., 2013" startWordPosition="667" endWordPosition="670"> level (Wang and Ng, 2013). Text normalization can also be approached as a machine 126 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 126–135, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics Figure 1: Normalization examples translation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined parallel data (Ling et al., 2013). One major challenge in text normalization research has been the lack of annotated data for training and evaluating methods. As a result, most Twitter text normalization methods have been unsupervised or semi-supervised (Cook and Stevenson, 2009; Han et al., 2012; Yang and Eisenstein, 2013), and evaluated over small-scale handannotated datasets. This has hampered analysis of the strengths and weaknesses of individual methods, and was our motivation in organizing the lexical normalization shared task. 2.2 Shared Task Design This lexical normalization shared task is focused exclusively on Engli</context>
</contexts>
<marker>Ling, Dyer, Black, Trancoso, 2013</marker>
<rawString>Wang Ling, Chris Dyer, Alan W Black, and Isabel Trancoso. 2013. Paraphrasing 4 microblog normalization. In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing (EMNLP 2013), pages 73–84, Seattle, USA, October.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Fei Liu</author>
<author>Fuliang Weng</author>
<author>Bingqing Wang</author>
<author>Yang Liu</author>
</authors>
<title>Insertion, deletion, or substitution? Normalizing text messages without pre-categorization nor supervision.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL HLT 2011),</booktitle>
<pages>71--76</pages>
<location>Portland, USA.</location>
<contexts>
<context position="3864" citStr="Liu et al., 2011" startWordPosition="576" endWordPosition="579">dard words include abbreviations (e.g., u “you”), and non-standard spellings (e.g., cuming “coming” or 2mr “tomorrow”). The prevalence of non-standard words in social media text results in markedly higher out-of-vocabulary (OOV) rates; normalizing the text brings OOV rates down to more conventional levels and makes the text more amenable to automatic processing with off-theshelf tools which have been trained on edited text. Text normalization over Twitter data has been addressed at different granularities. For instance, non-standard words can be considered as spelling errors at the character (Liu et al., 2011) or word level (Wang and Ng, 2013). Text normalization can also be approached as a machine 126 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 126–135, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics Figure 1: Normalization examples translation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined paralle</context>
<context position="5551" citStr="Liu et al. (2011)" startWordPosition="842" endWordPosition="845">otivation in organizing the lexical normalization shared task. 2.2 Shared Task Design This lexical normalization shared task is focused exclusively on English, and was designed with three primary desiderata in mind: (1) to construct a much larger dataset than existing resources; (2) to allow all of 1:1, 1:N and N:1 word n-gramm appings; and (3) to cover not just OOV non-standard words but also non-standard words that happen to coincide in spelling with standard words. In all three regards, the shared task expands upon the scope of the de facto evaluation datasets of Han and Baldwin (2011) and Liu et al. (2011). One constraint that was placed on candidate tokens for normalization was that they should be all-alphanumeric. For normalization, we adopted American spelling. In order to establish a more level playing field for participants, but also encourage the use of a wide range of resources, participants were required to nominate their system categories: • Constrained: participants could not use any data other than the provided training data to perform the text normalization task. They were allowed to use pre-trained tools (e.g., Twitter POS taggers), but no normalization lexicons or extra tweet data</context>
</contexts>
<marker>Liu, Weng, Wang, Liu, 2011</marker>
<rawString>Fei Liu, Fuliang Weng, Bingqing Wang, and Yang Liu. 2011. Insertion, deletion, or substitution? Normalizing text messages without pre-categorization nor supervision. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL HLT 2011), pages 71–76, Portland, USA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Fei Liu</author>
<author>Fuliang Weng</author>
<author>Xiao Jiang</author>
</authors>
<title>A broad-coverage normalization system for social media language.</title>
<date>2012</date>
<booktitle>In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012),</booktitle>
<pages>1035--1044</pages>
<location>Jeju Island, Korea,</location>
<contexts>
<context position="4372" citStr="Liu et al., 2012" startWordPosition="649" endWordPosition="652">es. For instance, non-standard words can be considered as spelling errors at the character (Liu et al., 2011) or word level (Wang and Ng, 2013). Text normalization can also be approached as a machine 126 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 126–135, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics Figure 1: Normalization examples translation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined parallel data (Ling et al., 2013). One major challenge in text normalization research has been the lack of annotated data for training and evaluating methods. As a result, most Twitter text normalization methods have been unsupervised or semi-supervised (Cook and Stevenson, 2009; Han et al., 2012; Yang and Eisenstein, 2013), and evaluated over small-scale handannotated datasets. This has hampered analysis of the strengths and weaknesses of individual methods, and was our motivation in organizing the lexical no</context>
</contexts>
<marker>Liu, Weng, Jiang, 2012</marker>
<rawString>Fei Liu, Fuliang Weng, and Xiao Jiang. 2012. A broad-coverage normalization system for social media language. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012), pages 1035–1044, Jeju Island, Korea, July.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Marco Lui</author>
<author>Timothy Baldwin</author>
</authors>
<title>langid.py: An off-the-shelf language identification tool.</title>
<date>2012</date>
<booktitle>In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012) Demo Session,</booktitle>
<pages>25--30</pages>
<location>Jeju, Republic of</location>
<contexts>
<context position="6496" citStr="Lui and Baldwin, 2012" startWordPosition="987" endWordPosition="990">e their system categories: • Constrained: participants could not use any data other than the provided training data to perform the text normalization task. They were allowed to use pre-trained tools (e.g., Twitter POS taggers), but no normalization lexicons or extra tweet data. • Unconstrained: participants could use any publicly accessible data or tools to perform the text normalization task. Evaluation was based on token-level precision, recall and F-score. 2.2.1 Preprocessing We first collected tweets using the Twitter Streaming API over the period 23–29 May, 2014, and then used langid.py (Lui and Baldwin, 2012)1 to remove all non-English tweets. Tokenization was performed with CMU-ARK tokeniser.2 To ensure that tweets had a high likelihood of requiring lexical normalization, we filtered out tweets with less than 2 non-standard words (i.e. words not occurring in our dictionary — see Section 2.2.3). While this biases the sample of tweets, the decision was made at a pragmatic level to ensure a reasonable level of lexical normalization and “annotation density”. This was based on a pilot study over a random sample of English tweets, in which we found that many non-standard words were actually unknown nam</context>
</contexts>
<marker>Lui, Baldwin, 2012</marker>
<rawString>Marco Lui and Timothy Baldwin. 2012. langid.py: An off-the-shelf language identification tool. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012) Demo Session, pages 25–30, Jeju, Republic of Korea.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Wookhee Min</author>
<author>Bradford Mott</author>
<author>James Lester</author>
<author>James Cox</author>
</authors>
<title>Ncsu sas wookhee: A deep contextual long-short term memory model for text normalization.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT,</booktitle>
<location>Beijing, China.</location>
<contexts>
<context position="12540" citStr="Min et al., 2015" startWordPosition="1954" endWordPosition="1957">tegory. Surprisingly, the best overall result was achieved by a constrained system, suggesting that the relative advantage in accessing additional datasets or resources has less impact than the quality of the underlying model that is used to model the task. NCSU SAS NING (Jin, 2015) Normalization candidates were generated based on the training data, and scored based on Jaccard index over character n-gram[ s]. Candidates were evaluated using random forest classifiers to offset parameter sensitivity, using features including normalization statistics, string similarity and POS. NCSU SAS WOOKHEE (Min et al., 2015) Word-level edits are predicted based on long-short term memory (LSTM) recurrent neural networks (RNN), using character sequences and POS tags as features. The LSTM is further complemented with a normalization lexicon induced from the training data. NCSU SAS SAM (Leeman-Munk et al., 2015) Two forward feed neural networks are used to predict: (1) the normalized token given an input token; and (2) whether a word should be normalized or left intact. Normalized tokens are further edited by a “conformer” which down-weights rare words as normalization candidates. IITP (Akhtar et al., 2015b) A CRF mo</context>
</contexts>
<marker>Min, Mott, Lester, Cox, 2015</marker>
<rawString>Wookhee Min, Bradford Mott, James Lester, and James Cox. 2015. Ncsu sas wookhee: A deep contextual long-short term memory model for text normalization. In proceedings of WNUT, Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Yerai Doval Mosquera</author>
<author>Jesús Vilares</author>
<author>Carlos Gómez-Rodriguez</author>
</authors>
<title>Lysgroup: Adapting a spanish microtext normalization system to english.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT,</booktitle>
<location>Beijing, China.</location>
<marker>Mosquera, Vilares, Gómez-Rodriguez, 2015</marker>
<rawString>Yerai Doval Mosquera, Jesús Vilares, and Carlos Gómez-Rodriguez. 2015. Lysgroup: Adapting a spanish microtext normalization system to english. In proceedings of WNUT, Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Barbara Plank</author>
<author>Dirk Hovy</author>
<author>Ryan McDonald</author>
<author>Anders Søgaard</author>
</authors>
<title>Adapting taggers to twitter with not-so-distant supervision.</title>
<date>2014</date>
<booktitle>In Proceedings of COLING 2014, the 25th International Conference on Computational Linguistics: Technical Papers,</booktitle>
<pages>1783--1792</pages>
<contexts>
<context position="3049" citStr="Plank et al., 2014" startWordPosition="454" endWordPosition="457">winning team (ousia) achieved suprisingly good performance on this difficult task, near the level of inter-rater agreement. 2 Text Normalization Shared Task In this section, we outline the Twitter Text Normalization Shared Task, describing the data and annotation process, and outlining the approaches adopted by participants. 2.1 Background Non-standard words are present in many text genres, including advertisements, professional forums, and SMS messages. They can be the cause of reading and understanding problems for humans, and degrade the accuracy of text processing tools (Han et al., 2013; Plank et al., 2014a; Kong et al., 2014). Text normalization aims to transform non-standard words to their canonical forms (Sproat et al., 2001; Han and Baldwin, 2011) as shown in Figure 1. Common examples of non-standard words include abbreviations (e.g., u “you”), and non-standard spellings (e.g., cuming “coming” or 2mr “tomorrow”). The prevalence of non-standard words in social media text results in markedly higher out-of-vocabulary (OOV) rates; normalizing the text brings OOV rates down to more conventional levels and makes the text more amenable to automatic processing with off-theshelf tools which have bee</context>
<context position="17187" citStr="Plank et al., 2014" startWordPosition="2670" endWordPosition="2673">(Dredze et al., 2010; Fromreide et al., 2014); the distribution of language and topics on Twitter is constantly shifting leading to degraded performance of NLP tools over time. To evaluate the effect of drift in a realistic scenario, the current evaluation uses a test set from a separate time period, which was not announced to participants until the (unannotated) test data was released at the beginning of the evaluation period. To address these challenges, there has been an increasing body of work on adapting named entity recognition tools to noisy social media text (Derczynski et al., 2015b; Plank et al., 2014a; Cherry and Guo, 2015; Ritter et al., 2011; Plank et al., 2014b), however different research groups have made use of different evaluation setups (e.g. training / test splits) making it challenging to perform direct comparisons across systems. By organizing a shared evaluation we hope to help establish a common evaluation methodology (for at least one dataset) and also promote research and development of NLP tools for user-generated social media text genres. 3.1 Training and Development Data The training and development data for our task was taken from previous work on Twitter NER (Ritter et </context>
</contexts>
<marker>Plank, Hovy, McDonald, Søgaard, 2014</marker>
<rawString>Barbara Plank, Dirk Hovy, Ryan McDonald, and Anders Søgaard. 2014a. Adapting taggers to twitter with not-so-distant supervision. In Proceedings of COLING 2014, the 25th International Conference on Computational Linguistics: Technical Papers, pages 1783–1792.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Barbara Plank</author>
<author>Dirk Hovy</author>
<author>Anders Søgaard</author>
</authors>
<title>Learning part-of-speech taggers with interannotator agreement loss.</title>
<date>2014</date>
<booktitle>In Proceedings of EACL.</booktitle>
<contexts>
<context position="3049" citStr="Plank et al., 2014" startWordPosition="454" endWordPosition="457">winning team (ousia) achieved suprisingly good performance on this difficult task, near the level of inter-rater agreement. 2 Text Normalization Shared Task In this section, we outline the Twitter Text Normalization Shared Task, describing the data and annotation process, and outlining the approaches adopted by participants. 2.1 Background Non-standard words are present in many text genres, including advertisements, professional forums, and SMS messages. They can be the cause of reading and understanding problems for humans, and degrade the accuracy of text processing tools (Han et al., 2013; Plank et al., 2014a; Kong et al., 2014). Text normalization aims to transform non-standard words to their canonical forms (Sproat et al., 2001; Han and Baldwin, 2011) as shown in Figure 1. Common examples of non-standard words include abbreviations (e.g., u “you”), and non-standard spellings (e.g., cuming “coming” or 2mr “tomorrow”). The prevalence of non-standard words in social media text results in markedly higher out-of-vocabulary (OOV) rates; normalizing the text brings OOV rates down to more conventional levels and makes the text more amenable to automatic processing with off-theshelf tools which have bee</context>
<context position="17187" citStr="Plank et al., 2014" startWordPosition="2670" endWordPosition="2673">(Dredze et al., 2010; Fromreide et al., 2014); the distribution of language and topics on Twitter is constantly shifting leading to degraded performance of NLP tools over time. To evaluate the effect of drift in a realistic scenario, the current evaluation uses a test set from a separate time period, which was not announced to participants until the (unannotated) test data was released at the beginning of the evaluation period. To address these challenges, there has been an increasing body of work on adapting named entity recognition tools to noisy social media text (Derczynski et al., 2015b; Plank et al., 2014a; Cherry and Guo, 2015; Ritter et al., 2011; Plank et al., 2014b), however different research groups have made use of different evaluation setups (e.g. training / test splits) making it challenging to perform direct comparisons across systems. By organizing a shared evaluation we hope to help establish a common evaluation methodology (for at least one dataset) and also promote research and development of NLP tools for user-generated social media text genres. 3.1 Training and Development Data The training and development data for our task was taken from previous work on Twitter NER (Ritter et </context>
</contexts>
<marker>Plank, Hovy, Søgaard, 2014</marker>
<rawString>Barbara Plank, Dirk Hovy, and Anders Søgaard. 2014b. Learning part-of-speech taggers with interannotator agreement loss. In Proceedings of EACL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Sam Clark</author>
<author>Oren Etzioni</author>
</authors>
<title>Named entity recognition in tweets: an experimental study.</title>
<date>2011</date>
<booktitle>In Proceedings of the Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>1524--1534</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<contexts>
<context position="17231" citStr="Ritter et al., 2011" startWordPosition="2678" endWordPosition="2681">4); the distribution of language and topics on Twitter is constantly shifting leading to degraded performance of NLP tools over time. To evaluate the effect of drift in a realistic scenario, the current evaluation uses a test set from a separate time period, which was not announced to participants until the (unannotated) test data was released at the beginning of the evaluation period. To address these challenges, there has been an increasing body of work on adapting named entity recognition tools to noisy social media text (Derczynski et al., 2015b; Plank et al., 2014a; Cherry and Guo, 2015; Ritter et al., 2011; Plank et al., 2014b), however different research groups have made use of different evaluation setups (e.g. training / test splits) making it challenging to perform direct comparisons across systems. By organizing a shared evaluation we hope to help establish a common evaluation methodology (for at least one dataset) and also promote research and development of NLP tools for user-generated social media text genres. 3.1 Training and Development Data The training and development data for our task was taken from previous work on Twitter NER (Ritter et al., 2011), which distinguishes 10 different</context>
</contexts>
<marker>Ritter, Clark, Etzioni, 2011</marker>
<rawString>Alan Ritter, Sam Clark, Oren Etzioni, et al. 2011. Named entity recognition in tweets: an experimental study. In Proceedings of the Conference on Empirical Methods in Natural Language Processing, pages 1524–1534. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Richard Sproat</author>
<author>Alan W Black</author>
<author>Stanley Chen</author>
<author>Shankar Kumar</author>
<author>Mari Ostendorf</author>
<author>Christopher Richards</author>
</authors>
<title>Normalization of non-standard words.</title>
<date>2001</date>
<journal>Computer Speech and Language,</journal>
<volume>15</volume>
<issue>3</issue>
<contexts>
<context position="3173" citStr="Sproat et al., 2001" startWordPosition="473" endWordPosition="476"> 2 Text Normalization Shared Task In this section, we outline the Twitter Text Normalization Shared Task, describing the data and annotation process, and outlining the approaches adopted by participants. 2.1 Background Non-standard words are present in many text genres, including advertisements, professional forums, and SMS messages. They can be the cause of reading and understanding problems for humans, and degrade the accuracy of text processing tools (Han et al., 2013; Plank et al., 2014a; Kong et al., 2014). Text normalization aims to transform non-standard words to their canonical forms (Sproat et al., 2001; Han and Baldwin, 2011) as shown in Figure 1. Common examples of non-standard words include abbreviations (e.g., u “you”), and non-standard spellings (e.g., cuming “coming” or 2mr “tomorrow”). The prevalence of non-standard words in social media text results in markedly higher out-of-vocabulary (OOV) rates; normalizing the text brings OOV rates down to more conventional levels and makes the text more amenable to automatic processing with off-theshelf tools which have been trained on edited text. Text normalization over Twitter data has been addressed at different granularities. For instance, </context>
</contexts>
<marker>Sproat, Black, Chen, Kumar, Ostendorf, Richards, 2001</marker>
<rawString>Richard Sproat, Alan W. Black, Stanley Chen, Shankar Kumar, Mari Ostendorf, and Christopher Richards. 2001. Normalization of non-standard words. Computer Speech and Language, 15(3):287–333.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Dmitry Supranovich</author>
<author>Viachaslau Patsepnia</author>
</authors>
<title>Ihs rd: Lexical normalization for english tweets.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT,</booktitle>
<location>Beijing, China.</location>
<contexts>
<context position="13753" citStr="Supranovich and Patsepnia, 2015" startWordPosition="2143" endWordPosition="2146">l., 2015b) A CRF model is trained over the training data, with features including word sequences, POS tags and morphology features. Post-processing heuristics are used to post-edit the output of the CRF. 5One team (GIGO) didn’t submit a description paper. DCU-ADAPT (Wagner and Foster, 2015) A generalized perceptron method is used generate word edit operations, with features including character n-gram[ s], character classes, and RNN language model hidden layer activation features. The final normalization word is selected based on the noisy channel model with a character language model. IHD RD (Supranovich and Patsepnia, 2015) non-standard words are identified using a CRF tagger, using features such as token-level features, contextual tokens, dictionary lookup, and edit distance. Multiple lexicons are combined to generate normalization candidates. A query misspelling correction module (i.e., DidYouMean) is used to post-process the output. USZEGED (Berend and Tasn´adi, 2015) A CRF model is used to identify tokens requiring normalization, and determine the type of normalization required. Normalization candidates are then proposed based on revised edit distance. The final normalization candidate is selected on the bas</context>
</contexts>
<marker>Supranovich, Patsepnia, 2015</marker>
<rawString>Dmitry Supranovich and Viachaslau Patsepnia. 2015. Ihs rd: Lexical normalization for english tweets. In proceedings of WNUT, Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Tian Tian</author>
</authors>
<title>Data adaptation for named entity recognition on tweets with features-rich crf.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT.</booktitle>
<contexts>
<context position="24258" citStr="Tian, 2015" startWordPosition="3825" endWordPosition="3826">lym 58.36 48.5 53.01 lattice 55.17 9.68 16.47 lattice 58.42 25.72 35.71 BASELINE 35.56 29.05 31.97 BASELINE 53.86 46.44 49.88 Table 8: Results segmenting and categorizing entities into 10 types. Hallym (Yang and Kim, 2015) The Hallym team used an approach based on CRFs using both Brown clusters and word embeddings trained using Canonical Correlation Analysis as features. iitp (Akhtar et al., 2015a) The iitp team proTable 9: Results on segmentation only (no types). posed a multi-objective differential evolution based technique for feature selection in twitter named entity recognition. lattice (Tian, 2015) Lattice employed a CRF model using Wapiti. The feature templates consisted of standard features used in stateof-the-art. They trained first a model with 132 dev 2015 and evaluated this model on train and dev. multimedialab (Godin et al., 2015) The goal of the multimedia lab system was to only use neural networks and word embeddings to show the power of automatic feature learning and semi-supervised methods. A FeedForward Neural Network was first trained, that used only word2vec word embeddings as input. Word embeddings were trained on 400 million unlabeled tweets. Leaky ReLUs were used as act</context>
</contexts>
<marker>Tian, 2015</marker>
<rawString>Tian Tian. 2015. Data adaptation for named entity recognition on tweets with features-rich crf. In proceedings of WNUT.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Erik F Tjong Kim Sang</author>
<author>Fien De Meulder</author>
</authors>
<title>Introduction to the conll-2003 shared task: Language-independent named entity recognition.</title>
<date>2003</date>
<booktitle>In Proceedings of the seventh conference on Natural language learning at HLT-NAACL 2003-Volume 4,</booktitle>
<pages>142--147</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<marker>Sang, De Meulder, 2003</marker>
<rawString>Erik F Tjong Kim Sang and Fien De Meulder. 2003. Introduction to the conll-2003 shared task: Language-independent named entity recognition. In Proceedings of the seventh conference on Natural language learning at HLT-NAACL 2003-Volume 4, pages 142–147. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Zhiqiang Toh</author>
<author>Bin Chen</author>
<author>Jian Su</author>
</authors>
<title>Improving twitter named entity recognition using word representations.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT.</booktitle>
<contexts>
<context position="25318" citStr="Toh et al., 2015" startWordPosition="4008" endWordPosition="4011">s first trained, that used only word2vec word embeddings as input. Word embeddings were trained on 400 million unlabeled tweets. Leaky ReLUs were used as activation function in combination with dropout to prevent overfitting. A context window of 5 words was used As input (2 words left and right). The output is a single tag of the middle word. Afterwards, a rule-based post-processing step was executed to ensure every I-tag has a B-tag in front of it and that all tags within a single span are of the same type. Train and dev were used as training data and used dev 2015 as validation set. NLANGP (Toh et al., 2015) The NLANGP team modeled the problem as a sequential labeling task and used Conditional Random Fields. Several post-processing steps (e.g. rulebased matching) were applied to refine the system output. Besides Brown clusters, Kmeans clusters were also used; the K-means clusters were generated based on word embeddings. nrc (Cherry et al., 2015) NRC applied a MIRAtrained semi-Markov tagger with Gazetteer, Brown cluster and Word Embedding features. The Word Embeddings were built over phrases using Word2Vec’s phrase finder tool, and were modified using an auto-encoder to be predictive of Gazetteer </context>
</contexts>
<marker>Toh, Chen, Su, 2015</marker>
<rawString>Zhiqiang Toh, Bin Chen, and Jian Su. 2015. Improving twitter named entity recognition using word representations. In proceedings of WNUT.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Joseph Turian</author>
<author>Lev Ratinov</author>
<author>Yoshua Bengio</author>
</authors>
<title>Word representations: a simple and general method for semi-supervised learning.</title>
<date>2010</date>
<booktitle>In Proceedings of the 48th annual meeting of the association for computational linguistics,</booktitle>
<pages>384--394</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<contexts>
<context position="2068" citStr="Turian et al., 2010" startWordPosition="302" endWordPosition="305">d resources. There were 6 official submissions in the constrained category, and 5 official submissions in the unconstrained category. Overall, deep learning methods and methods based on lexicon-augmented conditional random fields (CRFs) achieved the best results. The winning team achieved a precision of 0.9061 precision, recall of 0.7865, and F1 of 0.8421. The named entity recognition task attracted 8 participants. The majority of teams built their systems using linear-chain conditional random fields (Lafferty et al., 2001), and many teams also used brown clusters and word embedding features (Turian et al., 2010). Notable new techniques for named entity recognition in Twitter include a semi-Markov MIRA trained tagger (nrc), an end-to-end neural network using no handengineered features (multimedialab), an approach that weights training data to compensate for concept drift (USFD), and a differential evolution approach to feature selection (iitp). The submission from the winning team (ousia) achieved suprisingly good performance on this difficult task, near the level of inter-rater agreement. 2 Text Normalization Shared Task In this section, we outline the Twitter Text Normalization Shared Task, describi</context>
</contexts>
<marker>Turian, Ratinov, Bengio, 2010</marker>
<rawString>Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010. Word representations: a simple and general method for semi-supervised learning. In Proceedings of the 48th annual meeting of the association for computational linguistics, pages 384–394. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Joachim Wagner</author>
<author>Jennifer Foster</author>
</authors>
<title>Dcuadapt: Learning edit operations for microblog normalisation with the generalised perceptron.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT,</booktitle>
<location>Beijing, China.</location>
<contexts>
<context position="13412" citStr="Wagner and Foster, 2015" startWordPosition="2092" endWordPosition="2095">. NCSU SAS SAM (Leeman-Munk et al., 2015) Two forward feed neural networks are used to predict: (1) the normalized token given an input token; and (2) whether a word should be normalized or left intact. Normalized tokens are further edited by a “conformer” which down-weights rare words as normalization candidates. IITP (Akhtar et al., 2015b) A CRF model is trained over the training data, with features including word sequences, POS tags and morphology features. Post-processing heuristics are used to post-edit the output of the CRF. 5One team (GIGO) didn’t submit a description paper. DCU-ADAPT (Wagner and Foster, 2015) A generalized perceptron method is used generate word edit operations, with features including character n-gram[ s], character classes, and RNN language model hidden layer activation features. The final normalization word is selected based on the noisy channel model with a character language model. IHD RD (Supranovich and Patsepnia, 2015) non-standard words are identified using a CRF tagger, using features such as token-level features, contextual tokens, dictionary lookup, and edit distance. Multiple lexicons are combined to generate normalization candidates. A query misspelling correction mo</context>
</contexts>
<marker>Wagner, Foster, 2015</marker>
<rawString>Joachim Wagner and Jennifer Foster. 2015. Dcuadapt: Learning edit operations for microblog normalisation with the generalised perceptron. In proceedings of WNUT, Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Pidong Wang</author>
<author>Hwee Tou Ng</author>
</authors>
<title>A beamsearch decoder for normalization of social media text with application to machine translation.</title>
<date>2013</date>
<booktitle>In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL HLT 2013),</booktitle>
<pages>471--481</pages>
<location>Atlanta, USA,</location>
<contexts>
<context position="3898" citStr="Wang and Ng, 2013" startWordPosition="583" endWordPosition="586">(e.g., u “you”), and non-standard spellings (e.g., cuming “coming” or 2mr “tomorrow”). The prevalence of non-standard words in social media text results in markedly higher out-of-vocabulary (OOV) rates; normalizing the text brings OOV rates down to more conventional levels and makes the text more amenable to automatic processing with off-theshelf tools which have been trained on edited text. Text normalization over Twitter data has been addressed at different granularities. For instance, non-standard words can be considered as spelling errors at the character (Liu et al., 2011) or word level (Wang and Ng, 2013). Text normalization can also be approached as a machine 126 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 126–135, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics Figure 1: Normalization examples translation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined parallel data (Ling et al., 2013). One ma</context>
</contexts>
<marker>Wang, Ng, 2013</marker>
<rawString>Pidong Wang and Hwee Tou Ng. 2013. A beamsearch decoder for normalization of social media text with application to machine translation. In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL HLT 2013), pages 471–481, Atlanta, USA, June.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Ikuya Yamada</author>
<author>Hideaki Takeda</author>
<author>Takefuji Yoshiyasu</author>
</authors>
<title>Enhancing named entity recognition in twitter messages using entity linking.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT.</booktitle>
<contexts>
<context position="22084" citStr="Yamada et al., 2015" startWordPosition="3450" endWordPosition="3453">verall 56.64 57.52 57.07 Table 5: Precision and recall comparing one annotator against the other. Cohen’s kappa between the annotators was 0.607. Disagreements between the annotators resolved by a 3rd adjudicator for the final datasets. Team ID Affiliation Hallym Hallym University iitp Indian Institute of Technology Patna lattice University Paris 3 multimedialab UGent - iMinds NLANGP Institute for Infocomm Research nrc National Research Council Canada ousia Studio Ousia USFD University of Sheffield Table 6: Team ID and affiliation of the named entity recognition shared task participants. sia (Yamada et al., 2015). All the other teams used CRFs. On top of a CRF, the iitp team used a differential evolution based technique to obtain an optimal feature set. Most systems used the training data as well as both dev sets provided to train their system, except multimedialab which did not use dev2015 as training data and NRC which only used train. 9 Tables 8 and 9 report the results obtained by each team for segmentation and classification of the 10 named entity types and for segmentation only, respectively. 3.4 System Descriptions Following is a brief description of the approach taken by each team: 9A post-com</context>
<context position="25957" citStr="Yamada et al., 2015" startWordPosition="4107" endWordPosition="4110">deled the problem as a sequential labeling task and used Conditional Random Fields. Several post-processing steps (e.g. rulebased matching) were applied to refine the system output. Besides Brown clusters, Kmeans clusters were also used; the K-means clusters were generated based on word embeddings. nrc (Cherry et al., 2015) NRC applied a MIRAtrained semi-Markov tagger with Gazetteer, Brown cluster and Word Embedding features. The Word Embeddings were built over phrases using Word2Vec’s phrase finder tool, and were modified using an auto-encoder to be predictive of Gazetteer membership. ousia (Yamada et al., 2015) The main characteristics of the ousia method is enhancing the performance of Twitter named entity recognition using entity linking. Once entity mentions are disambiguated to the knowledge base entries, high-quality knowledge can be easily extracted from a knowledge base such as the popularity of the entity, the classes of the entity, and the likelihood that the entity appears in the given context. They adopted supervised machine-learning with features including the results of NER and various information of the entity in knowledge bases. We use Stanford NER was used for the NER and in-house en</context>
</contexts>
<marker>Yamada, Takeda, Takefuji, 2015</marker>
<rawString>Ikuya Yamada, Hideaki Takeda, and Takefuji Yoshiyasu. 2015. Enhancing named entity recognition in twitter messages using entity linking. In proceedings of WNUT.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Yi Yang</author>
<author>Jacob Eisenstein</author>
</authors>
<title>A log-linear model for unsupervised text normalization.</title>
<date>2013</date>
<booktitle>In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing (EMNLP 2013),</booktitle>
<pages>61--72</pages>
<location>Seattle, USA</location>
<contexts>
<context position="4782" citStr="Yang and Eisenstein, 2013" startWordPosition="714" endWordPosition="718">anslation task, whereby non-standard words are mapped to more canonical expressions (Aw et al., 2006). Other approaches have involved deep learning (Chrupała, 2014), cognitively-inspired approaches (Liu et al., 2012), random walks (Hassan and Menezes, 2013), and supervision using automatically-mined parallel data (Ling et al., 2013). One major challenge in text normalization research has been the lack of annotated data for training and evaluating methods. As a result, most Twitter text normalization methods have been unsupervised or semi-supervised (Cook and Stevenson, 2009; Han et al., 2012; Yang and Eisenstein, 2013), and evaluated over small-scale hand-annotated datasets. This has hampered analysis of the strengths and weaknesses of individual methods, and was our motivation in organizing the lexical normalization shared task. 2.2 Shared Task Design This lexical normalization shared task is focused exclusively on English, and was designed with three primary desiderata in mind: (1) to construct a much larger dataset than existing resources; (2) to allow all of 1:1, 1:N and N:1 word n-gram mappings; and (3) to cover not just OOV non-standard words but also non-standard words that happen to coincide in spell</context>
</contexts>
<marker>Yang, Eisenstein, 2013</marker>
<rawString>Yi Yang and Jacob Eisenstein. 2013. A log-linear model for unsupervised text normalization. In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing (EMNLP 2013), pages 61–72, Seattle, USA, October.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Eun-Suk Yang</author>
<author>Yu-Seop Kim</author>
</authors>
<title>Hallym: Named entity recognition on twitter.</title>
<date>2015</date>
<booktitle>In proceedings of WNUT.</booktitle>
<contexts>
<context position="23869" citStr="Yang and Kim, 2015" startWordPosition="3764" endWordPosition="3767">proach taken by each team. Precision Recall Fβ=1 Precision Recall Fβ=1 ousia 57.66 55.22 56.41 ousia 72.20 69.14 70.63 NLANGP 63.62 43.12 51.40 NLANGP 67.74 54.31 60.29 nrc 53.24 38.58 44.74 USFD 63.81 56.28 59.81 multimedialab 49.52 39.18 43.75 multimedialab 62.93 55.22 58.82 USFD 45.72 39.64 42.46 nrc 62.13 54.61 58.13 iitp 60.68 29.65 39.84 iitp 63.43 51.44 56.81 Hallym 39.59 35.10 37.21 Hallym 58.36 48.5 53.01 lattice 55.17 9.68 16.47 lattice 58.42 25.72 35.71 BASELINE 35.56 29.05 31.97 BASELINE 53.86 46.44 49.88 Table 8: Results segmenting and categorizing entities into 10 types. Table 9: Results on segmentation only (no types). Hallym (Yang and Kim, 2015) The Hallym team used an approach based on CRFs using both Brown clusters and word embeddings trained using Canonical Correlation Analysis as features. iitp (Akhtar et al., 2015a) The iitp team proposed a multi-objective differential evolution based technique for feature selection in twitter named entity recognition. lattice (Tian, 2015) Lattice employed a CRF model using Wapiti. The feature templates consisted of standard features used in state-of-the-art. They trained first a model with 132 dev 2015 and evaluated this model on train and dev. m</context>
</contexts>
<marker>Yang, Kim, 2015</marker>
<rawString>Eun-Suk Yang and Yu-Seop Kim. 2015. Hallym: Named entity recognition on twitter. In proceedings of WNUT.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>