<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.029670">
<title confidence="0.997284">
USZEGED: Correction Type-sensitive Normalization of English Tweets
Using Efficiently Indexed n-gram Statistics
</title>
<author confidence="0.99566">
Gábor Berend
</author>
<affiliation confidence="0.9958295">
University of Szeged
Department of Informatics
</affiliation>
<address confidence="0.698345">
Árpád tér 2., 6720 Szeged, Hungary
</address>
<email confidence="0.995287">
berendg@inf.u-szeged.hu
</email>
<author confidence="0.996891">
Ervin Tasnádi
</author>
<affiliation confidence="0.995817">
University of Szeged
Department of Informatics
</affiliation>
<address confidence="0.690134">
Árpád tér 2., 6720 Szeged, Hungary
</address>
<email confidence="0.982778">
Tasnadi.Ervin@stud.u-szeged.hu
</email>
<sectionHeader confidence="0.993244" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.99987765">
This paper describes the framework ap-
plied by team USZEGED at the “Lexical
Normalisation for English Tweets” shared
task. Our approach first employs a CRF-
based sequence labeling framework to de-
cide the kind of corrections the individ-
ual tokens require, then performs the nec-
essary modifications relying on external
lexicons and a massive collection of effi-
ciently indexed n-gram statistics from En-
glish tweets. Our solution is based on
the assumption that from the context of
the OOV words, it is possible to recon-
struct its IV equivalent, as there are users
who use the standard English form of the
OOV word within the same context. Our
approach achieved an F-score of 0.8052,
being the second best one among the un-
constrained submissions, the category our
submission also belongs to.
</bodyText>
<sectionHeader confidence="0.998991" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999314736842105">
Social media is a rich source of information which
has been proven to be useful to a variety of ap-
plications, such as event extraction (Sakaki et al.,
2010; Ritter et al., 2012; Ritter et al., 2015)
or trend detection, including the tracking of epi-
demics (Lamb et al., 2013). Analyzing tweets
in general, however, can pose several difficulties.
From an engineering point of view, the streaming
nature of tweets requires that special attention is
paid to the scalability of the algorithms applied
and from an NLP point of view, the often sub-
standard characteristics of social media utterances
have to be addressed. The fact that tweets are often
written on mobile devices and are informal makes
the misspelling and abbreviations of words and ex-
pressions, as well as the use of creative informal
language prevalent, giving rise to a higher number
of out-of-vocabulary (OOV) words than in other
genres.
</bodyText>
<sectionHeader confidence="0.999778" genericHeader="related work">
2 Related Work
</sectionHeader>
<bodyText confidence="0.999712378378378">
The informal language of social media, includ-
ing Twitter, is extremely heterogeneous, making
its grammatical analysis more difficult compared
to standard genres such as newswire. It has been
shown previously, that the performance of linguis-
tic analyzers trained on standard text types de-
grades severely once they are applied to texts found
in social media, especially tweets (Ritter et al.,
2011; Derczynski et al., 2013).
In order to build taggers that perform more re-
liably on social media texts, one possible way is
to augment the training data by including texts
originating from social media (Derczynski et al.,
2013). Such approaches, however, require con-
siderable human effort, so one possible alternative
can be to normalize the social media texts first,
then apply standard analyzers on these normalized
texts. Recently, a number of approaches have been
proposed for the lexical normalization of informal
(mostly social media and SMS) texts (Liu et al.,
2011; Liu et al., 2012; Han et al., 2013; Yang and
Eisenstein, 2013).
Han and Baldwin (2011) rely on the identifica-
tion of the words that require correction, then de-
fine a confusion set containing the candidate IV
correction forms for such words. Finally, a rank-
ing scheme, taking multiple factors into consid-
eration, is applied which selects the most likely
correction for an OOV word. In their subsequent
work, Han et al. (2012) propose an automated
method to construct accurate normalization dictio-
naries.
Liu et al. (2011; 2012) propose a character-level
sequence model to predict insertions, deletions
and substitutions. They first collect a large set of
noisy (OOV, IV) training pairs from the Web.
These pairs are then aligned at the character level
</bodyText>
<page confidence="0.944394">
120
</page>
<note confidence="0.786501">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 120–125,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.999906625">
and provided as training data for a CRF classifier.
The authors also released their 3,802-element nor-
malization dictionary that our work also relies on.
Yang and Eisenstein (2013) introduce an unsu-
pervised log-linear model for the task of text nor-
malization. Besides the features that can be de-
rived from pairs of words (e.g. edit distance), fea-
tures considering the context are also employed in
their model. As the number of class labels in that
model is equal to the size of the IV words an OOV
word could possibly be corrected to (typically on
the order of $10^4$–$10^5$, which is far beyond the typ-
ical label size of classification tasks), the authors
propose the use of Sequential Monte Carlo train-
ing approach for learning the appropriate feature
weights.
</bodyText>
<sectionHeader confidence="0.983506" genericHeader="method">
3 The Task of Lexical Normalization
</sectionHeader>
<bodyText confidence="0.999868214285714">
Formally, given an $m$-long sequence of words in
the $i$th tweet, $T_i = [t_{i,1}, t_{i,2}, \ldots, t_{i,m}]$, partici-
pants of the shared task had to return a sequence
of normalized in-vocabulary (IV) words, i.e.
$S_i = [s_{i,1}, s_{i,2}, \ldots, s_{i,m}]$. The training set of the shared
task consisted of 2,950 tweets comprising 44,385
tokens, while the test set had 1,967 tweets which
included a total of 29,421 tokens. According to
the dataset, most of the words did not require any
kind of corrections, i.e. the proportion of unmodi-
fied words was 91.12% and 90.57% for the train-
ing and test set, respectively. Further details with
respect to the shared task can be found in the paper
(Baldwin et al., 2015).
As a consequence, we first built a sequence
model to decide which tokens need to be corrected
and in what way. A typical distinction of the cor-
rection types would be based on the number of
tokens a noisy token and its corrected form com-
prises of. According to this approach, one could
distinguish between one-to-one, one-to-many and
many-to-one corrections on the per token basis.
However, instead of applying the above types of
corrections, we identified a more detailed catego-
rization of the correction types and trained a linear
chain CRF utilizing CRFsuite (Okazaki, 2007).
The correction types a token could be classified
as were the following:
</bodyText>
<listItem confidence="0.99792575">
• MissingApos, standing for tokens that only
differ from their corrected version in the ab-
sence of an apostrophe (e.g. youll → you’ll),
• MissingWS, standing for tokens that only
</listItem>
<table confidence="0.999486111111111">
Training Test
MissingApos 507 369
MissingWS 126 76
1to1ED≤2 1,979 1,405
1to1ED≥3 413 292
1toMABB 917 634
Subtotal 3,942 2,776
O 40,443 26,645
Total 44,385 29,421
</table>
<tableCaption confidence="0.954335">
Table 1: Distribution of the correction types in the
training and test sets
</tableCaption>
<bodyText confidence="0.392444666666667">
differ from their corrected version in the ab-
sence of one or more whitespace characters
(e.g. whataburger → what a burger),
</bodyText>
<listItem confidence="0.931652142857143">
• 1to1ED≤2, standing for corrections where
no whitespace characters had to be inserted
and the augmented edit distance (introduced
in Section 4.2) between the noisy token
and its normalized form was at most 2
(e.g. tmrw→tomorrow),
• 1to1ED≥3, standing for corrections where no
whitespace characters had to be inserted and
the augmented edit distance was at least 3
(e.g. plz→please),
• 1toMABB, standing for corrections where
both whitespace and alphanumeric characters
had to be inserted to obtain a token’s corrected
variant (e.g. lol → laugh out loud).
</listItem>
<bodyText confidence="0.999908625">
For the sake of completeness, we should add that a
further class label (O) was employed. This, how-
ever, corresponded to the case when there was no
correction required to be performed for a token.
As mentioned above, more than 90% of the words
in both the training and test sets belonged to this
category. Table 1 shows the distribution of the cor-
rection types on both the training and test sets.
</bodyText>
<sectionHeader confidence="0.98079" genericHeader="method">
4 Proposed Approach
</sectionHeader>
<bodyText confidence="0.9999875">
Our approach consists of a sequence labeling mod-
ule and relies on lookups from an efficiently in-
dexed n-gram corpus of English tweets. Subse-
quently, we describe the details of these modules.
</bodyText>
<subsectionHeader confidence="0.997015">
4.1 Sequence Labeling for Determining
Correction Types
</subsectionHeader>
<bodyText confidence="0.9994605">
As already mentioned in Section 3, the first com-
ponent in our pipeline was a linear chain CRF
</bodyText>
<page confidence="0.991866">
121
</page>
<bodyText confidence="0.9954368">
(Lafferty et al., 2001). Besides the common word
surface forms, such as the capitalization pattern,
the first letter or character suffixes, we relied on
the following dictionary resources upon determin-
ing the features for the individual words:
</bodyText>
<listItem confidence="0.991875555555556">
• the SCOWL dictionary being part of the
aspell spell checker project containing
canonical English dictionary entries,
• the normalization dictionaries of Han et
al. (2012) and Liu et al. (2012),
• the 5,307-element normalization dictionary
derived from the portal noslang.com,
which map common social media abbrevia-
tions to their complete forms.
</listItem>
<bodyText confidence="0.999773714285714">
For each token, word type features were gener-
ated along with the word types of its neighboring
tokens. The POS tags assigned to each token and
its neighboring tokens by the Twitter POS tagger
(Gimpel et al., 2011) were also utilized as features
in the CRF model. The Twitter POS tag set was
useful to us, as it contains a separate tag (G) for
multi-word abbreviations (e.g. ily for I love you),
which was expected to be highly indicative for the
correction type 1toMABB.
In order to be able to discriminate the
MissingWS class, we introduced a feature
which indicates for a token t originating from a
tweet whether the relation
</bodyText>
<equation confidence="0.9455665">
$$\max_{s \in \mathrm{split}(t)} \mathrm{freq}_{1T}(s) \geq \tau$$
</equation>
<bodyText confidence="0.999804111111111">
holds, where τ is a threshold calibrated to $10^6$
based on the training set, freq1T(s) is a function
which returns the frequency value associated with
a string s according to the Google 1T 5-gram cor-
pus and the function split(t) returns the set of all
the possible splits of token t such that its com-
ponents are all contained in the SCOWL dictio-
nary. For instance split(“whataburger”) returns
a set of splits including “what a burger”, “what
a burg er” and “what ab urger”. As there is a
split (i.e. “what a burger”) that is sufficiently fre-
quent according to the n-gram corpus, we take it as
an indication that the original token omitted some
whitespace characters that we need to insert.
A CRF model with the above feature set was
trained using L-BFGS training method and L1 reg-
ularization using CRFsuite (Okazaki, 2007). The
overall token accuracy this model achieved was
</bodyText>
<table confidence="0.99996725">
Precision Recall F-score
MissingApos 0.9686 0.9744 0.9715
MissingWS 0.8795 0.5794 0.6986
1to1ED≤2 0.9078 0.8504 0.8782
1to1ED≥3 0.9593 0.6852 0.7994
1toMABB 0.9624 0.8942 0.9271
O 0.9874 0.9959 0.9916
macro average 0.9442 0.8299 0.8777
</table>
<tableCaption confidence="0.9318435">
Table 2: Results of predicting the correction types
for tokens on the training set
</tableCaption>
<table confidence="0.9999795">
Precision Recall F-score
MissingApos 0.9755 0.9702 0.9728
MissingWS 0.7674 0.4342 0.5546
1to1ED≤2 0.8619 0.7950 0.8271
1to1ED≥3 0.8793 0.5240 0.6567
1toMABB 0.9449 0.8659 0.9037
O 0.9816 0.9932 0.9874
macro average 0.9018 0.7638 0.8171
</table>
<tableCaption confidence="0.998666">
Table 3: Results of predicting the correction types
</tableCaption>
<bodyText confidence="0.9847785">
for tokens on the test set
0.9830 and 0.9746 and the proportion of tweets
for which all the tokens were tagged properly was
0.7902 and 0.7143 for the training and test sets,
respectively. A more detailed breakdown of the
classification performances of the sequence model
on the training and test sets is included in Ta-
ble 2 and Table 3. These tables reveal that the
most difficult error type to identify was the one
where a word missed some whitespace characters
(row MissingWS). This class happens to be the
least frequent and one of the most heterogeneous
classes as well, which might be an explanation for
the lower results on that class.
</bodyText>
<subsectionHeader confidence="0.986924">
4.2 Augmented Edit Distance
</subsectionHeader>
<bodyText confidence="0.999937">
When determining a set of candidate IV words
that an OOV might be rewritten for, it is a com-
mon practice to place an upper bound on the edit
distance between the IV candidates and the OOV
word. In order to measure edit distance between
tokens originating from tweets and their corrected
forms, we implemented a modification of the stan-
dard edit distance algorithm that is especially tai-
lored to measuring the difference of OOV tokens
originating from social media to IV ones.
The edit distance we employed is asymmetric
as insertions of characters into OOV tokens have
no costs. For instance, for the words tmrw and to-
</bodyText>
<page confidence="0.995484">
122
</page>
<bodyText confidence="0.999488555555556">
morrow, the edit distance is regarded as 0 if the
former is considered to be the substandard OOV
token and the latter one as the standard IV one.
Note, however, if the role of the two tokens was
changed (i.e if tmrw was treated as IV and tomor-
row as OOV), their edit distance would become 4.
A further relaxation to the standard edit distance
is that we assign 0 cost to the following kinds of
phonetically motivated transcriptions:
</bodyText>
<listItem confidence="0.99222675">
• z → s located at the end of words (e.g. in
catz → cats),
• a → er located at the end of words (e.g. in
bigga → bigger).
</listItem>
<bodyText confidence="0.999967333333333">
By making the above relaxations to the def-
inition of the standard edit distance, we could
obtain larger candidate sets for a given edit dis-
tance threshold for tokens with higher recall, as we
could reduce the edit distance between the OOV
words and their appropriate IV equivalent in many
cases. Obviously, as the candidate set grows, it
might get increasingly difficult to choose the cor-
rect normalization from it. However, at this stage
of our pipeline, we were more interested in having
the correct IV word in the set of candidate normal-
ization, rather than reducing its size.
</bodyText>
<subsectionHeader confidence="0.999071">
4.3 Making Use of Twitter n-gram Statistics
</subsectionHeader>
<bodyText confidence="0.999976166666667">
Our basic assumption was that from the context
of an OOV word, it is possible to reconstruct its
IV equivalent, as there are users who use the cor-
rect IV English form of the OOV word within the
same context, e.g. see you tomorrow instead of
see u tmrw. The Twitter n-gram frequencies we
made use of were the ones that we aggregated over
the Twitter n-gram corpus augmented with demo-
graphic metadata described in (Herdağdelen, 2013).
For a given token ti at position i in a tweet, we
chose the most probable corrected form according
to the formula
</bodyText>
<equation confidence="0.9864685">
$$\arg\max_{t' \in C(t_i, ct(t_i))} P(t' \mid t_{i-1}) \, P(t_{i+1} \mid t') \qquad (1)$$
</equation>
<bodyText confidence="0.99966725">
where the function $C(t_i, ct(t_i))$ returns a set of
IV candidates for the token $t_i$, according to $ct(t_i)$,
which is the correction type determined for that
token by the sequence model introduced in Sec-
tion 4.1. We indexed the Twitter n-gram corpus
with the highly effective LIT indexer (Ceylan and
Mihalcea, 2011), which made fast queries of the
form $t_{i-1} * t_{i+1}$ possible, the symbol * being a
</bodyText>
<table confidence="0.9998826">
Correction Precision Recall F-score
MissingApos 0.9972 0.9972 0.9972
MissingWS 0.8684 0.4177 0.5641
1to1 0.9191 0.9219 0.9205
1toMABB 0.8861 0.9533 0.9185
</table>
<tableCaption confidence="0.9912495">
Table 4: Detailed performance on the different
correction types on the training dataset
</tableCaption>
<table confidence="0.9999636">
Correction Precision Recall F-score
MissingApos 1.0000 0.9841 0.992
MissingWS 0.9737 0.4458 0.6116
1to1 0.9141 0.9127 0.9134
1toMABB 0.8523 0.9699 0.9073
</table>
<tableCaption confidence="0.883946">
Table 5: Detailed performance on the different
correction types on the test dataset
</tableCaption>
<bodyText confidence="0.998732916666667">
placeholder for any token at the given position.
The only case when we did not choose the normal-
ization of an OOV word according to (1) was when
there was a unique suggestion for an IV word in
the normalization dictionaries we listed in Sec-
tion 4.1.
The performance of the normalization on the
training and test sets, according to the correction
types we defined can be found in Table 4 and Ta-
ble 5, respectively. From these tables, one can see
that the worst results were obtained for the correc-
tion type when spaces were required to be inserted
to an OOV word.
This is in accordance with the fact that our se-
quence model obtained the lowest scores exactly
on this kind of corrections. However, due to the
fact that this error category is the least frequent,
the lower scores on that category do not harm
our overall performance that much, as can be seen
in Table 6 for both the training and test corpora.
The results shown in Table 6 also illustrate that
our approach seems to generalize well, as there is
a small gap between the performances observed on
the training and test sets of the shared task.
</bodyText>
<table confidence="0.99507">
Training Test
precision 0.8703 0.8606
recall 0.7673 0.7564
F1 0.8156 0.8052
</table>
<tableCaption confidence="0.9964645">
Table 6: Overall performance of our system on the
training and test sets
</tableCaption>
<page confidence="0.998712">
123
</page>
<sectionHeader confidence="0.998575" genericHeader="conclusions">
5 Conclusion
</sectionHeader>
<bodyText confidence="0.999925636363637">
In this paper, we introduced our approach to the
lexical normalization of English tweets that ranked
second at the shared task among the unconstrained
submissions. Our framework first performs se-
quence labeling over the tokens of a tweet to pre-
dict which tokens need to be corrected and in what
way. This step is followed by correction type-
sensitive candidate set generation, from which set
the most likely IV normalization of an OOV word
is selected by querying an efficiently indexed large
n-gram dataset of English tweets.
</bodyText>
<sectionHeader confidence="0.998932" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.999804747368421">
Timothy Baldwin, Marie Catherine de Marneffe,
Bo Han, Young-Bum Kim, Alan Ritter, and Wei
Xu. 2015. Shared tasks of the 2015 workshop on
noisy user-generated text: Twitter lexical normal-
ization and named entity recognition. In Proceed-
ings of the Workshop on Noisy User-generated Text
(WNUT 2015), Beijing, China.
Hakan Ceylan and Rada Mihalcea. 2011. An efficient
indexer for large n-gram corpora. In Proceedings of
the 49th Annual Meeting of the Association for Com-
putational Linguistics: Human Language Technolo-
gies: Systems Demonstrations, HLT ’11, pages 103–
108, Stroudsburg, PA, USA. Association for Com-
putational Linguistics.
Leon Derczynski, Alan Ritter, Sam Clark, and Kalina
Bontcheva. 2013. Twitter part-of-speech tagging
for all: Overcoming sparse and noisy data. In Pro-
ceedings of the International Conference on Recent
Advances in Natural Language Processing. Associ-
ation for Computational Linguistics.
Kevin Gimpel, Nathan Schneider, Brendan O’Connor,
Dipanjan Das, Daniel Mills, Jacob Eisenstein,
Michael Heilman, Dani Yogatama, Jeffrey Flanigan,
and Noah A. Smith. 2011. Part-of-speech tagging
for twitter: Annotation, features, and experiments.
In Proceedings of the 49th Annual Meeting of the
Association for Computational Linguistics: Human
Language Technologies: Short Papers - Volume 2,
HLT ’11, pages 42–47, Stroudsburg, PA, USA. As-
sociation for Computational Linguistics.
Bo Han and Timothy Baldwin. 2011. Lexical normal-
isation of short text messages: Makn sens a #twit-
ter. In Proceedings of the 49th Annual Meeting of
the Association for Computational Linguistics: Hu-
man Language Technologies - Volume 1, HLT ’11,
pages 368–378, Stroudsburg, PA, USA. Association
for Computational Linguistics.
Bo Han, Paul Cook, and Timothy Baldwin. 2012. Au-
tomatically constructing a normalisation dictionary
for microblogs. In Proceedings of the 2012 Joint
Conference on Empirical Methods in Natural Lan-
guage Processing and Computational Natural Lan-
guage Learning (EMNLP-CoNLL 2012), pages 421–
432, Jeju Island, Korea.
Bo Han, Paul Cook, and Timothy Baldwin. 2013. Lex-
ical normalization for social media text. ACM Trans.
Intell. Syst. Technol., 4(1):5:1–5:27, February.
Amaç Herdağdelen. 2013. Twitter n-gram corpus with
demographic metadata. Language Resources and
Evaluation, 47(4):1127–1147.
John D. Lafferty, Andrew McCallum, and Fernando
C. N. Pereira. 2001. Conditional random fields:
Probabilistic models for segmenting and labeling se-
quence data. In Proceedings of the Eighteenth Inter-
national Conference on Machine Learning, ICML
’01, pages 282–289, San Francisco, CA, USA. Mor-
gan Kaufmann Publishers Inc.
Alex Lamb, Michael J. Paul, and Mark Dredze. 2013.
Separating fact from fear: Tracking flu infections on
twitter. In NAACL.
Fei Liu, Fuliang Weng, Bingqing Wang, and Yang Liu.
2011. Insertion, Deletion, or Substitution? Nor-
malizing Text Messages without Pre-categorization
nor Supervision. In Proceedings of the 49th Annual
Meeting of the Association for Computational Lin-
guistics: Human Language Technologies, pages 71–
76, Stroudsburg, PA, USA. Association for Compu-
tational Linguistics.
Fei Liu, Fuliang Weng, and Xiao Jiang. 2012. A
Broad-Coverage Normalization System for Social
Media Language. Proceedings of the 50th Annual
Meeting of the Association for Computational Lin-
guistics: Long Papers-Volume 1, pages 1035–1044.
Naoaki Okazaki. 2007. Crfsuite: a fast implementa-
tion of conditional random fields (crfs).
Alan Ritter, Sam Clark, Mausam, and Oren Etzioni.
2011. Named entity recognition in tweets: An ex-
perimental study. In Proceedings of the Conference
on Empirical Methods in Natural Language Pro-
cessing, EMNLP ’11, pages 1524–1534, Strouds-
burg, PA, USA. Association for Computational Lin-
guistics.
Alan Ritter, Mausam, Oren Etzioni, and Sam Clark.
2012. Open domain event extraction from twitter.
In Proceedings of the 18th ACM SIGKDD Inter-
national Conference on Knowledge Discovery and
Data Mining, KDD ’12, pages 1104–1112, New
York, NY, USA. ACM.
Alan Ritter, Evan Wright, William Casey, and Tom
Mitchell. 2015. Weakly supervised extraction of
computer security events from twitter. In Proceed-
ings of the 24th International Conference on World
Wide Web, WWW ’15, pages 896–905, Republic and
Canton of Geneva, Switzerland. International World
Wide Web Conferences Steering Committee.
</reference>
<page confidence="0.982931">
124
</page>
<reference confidence="0.9973088">
Takeshi Sakaki, Makoto Okazaki, and Yutaka Matsuo.
2010. Earthquake shakes twitter users: Real-time
event detection by social sensors. In Proceedings
of the 19th International Conference on World Wide
Web, WWW ’10, pages 851–860, New York, NY,
USA. ACM.
Yi Yang and Jacob Eisenstein. 2013. A Log-Linear
Model for Unsupervised Text Normalization. Pro-
ceedings of the Empirical Methods on Natural Lan-
guage Processing (EMNLP), pages 61–72.
</reference>
<page confidence="0.998441">
125
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.305545">
<title confidence="0.9948395">USZEGED: Correction Type-sensitive Normalization of English Using Efficiently Indexed n-gram Statistics</title>
<author confidence="0.98075">G´abor</author>
<affiliation confidence="0.85075">University of Department of ´Arp´ad t´er 2., 6720 Szeged,</affiliation>
<email confidence="0.995337">berendg@inf.u-szeged.hu</email>
<author confidence="0.858447">Ervin</author>
<affiliation confidence="0.887129">University of Department of ´Arp´ad t´er 2., 6720 Szeged,</affiliation>
<email confidence="0.933342">Tasnadi.Ervin@stud.u-szeged.hu</email>
<abstract confidence="0.998478619047619">This paper describes the framework applied by team USZEGED at the “Lexical Normalisation for English Tweets” shared task. Our approach first employs a CRFbased sequence labeling framework to decide the kind of corrections the individual tokens require, then performs the necessary modifications relying on external lexicons and a massive collection of efficiently indexed n-gram statistics from English tweets. Our solution is based on the assumption that from the context of the OOV words, it is possible to reconstruct its IV equivalent, as there are users who use the standard English form of the OOV word within the same context. Our approach achieved an F-score of 0.8052, being the second best one among the unconstrained submissions, the category our submission also belongs to.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Timothy Baldwin</author>
<author>Marie Catherine de Marneffe</author>
<author>Bo Han</author>
<author>Young-Bum Kim</author>
<author>Alan Ritter</author>
<author>Wei Xu</author>
</authors>
<title>Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition.</title>
<date>2015</date>
<booktitle>In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015),</booktitle>
<location>Beijing, China.</location>
<marker>Baldwin, de Marneffe, Han, Kim, Ritter, Xu, 2015</marker>
<rawString>Timothy Baldwin, Marie Catherine de Marneffe, Bo Han, Young-Bum Kim, Alan Ritter, and Wei Xu. 2015. Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015), Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Hakan Ceylan</author>
<author>Rada Mihalcea</author>
</authors>
<title>An efficient indexer for large n-gram corpora.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: Systems Demonstrations, HLT ’11,</booktitle>
<pages>103--108</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="14063" citStr="Ceylan and Mihalcea, 2011" startWordPosition="2339" endWordPosition="2342">cies we made use of were the ones that we aggregated over the Twitter n-gram corpus augmented with demographic metadata described in (Herdadelen, 2013). For a given token ti at position i in a tweet, we chose the most probable corrected form according to the formula arg max P(t1|ti−1)P(ti+1|t&apos;), (1) t�EC(ti,et(ti)) where the function C(ti, ct(ti)) returns a set of IV candidates for the token ti, according to ct(ti), which is the correction type determined for that token by the sequence model introduced in Section 4.1. We indexed the Twitter n-gram corpus with the highly effective LIT indexer (Ceylan and Mihalcea, 2011), which made fast queries of the form ti−1 * ti+1 possible, the symbol * being a Correction Precision Recall F-score MissingApos 0.9972 0.9972 0.9972 MissingW5 0.8684 0.4177 0.5641 1to1 0.9191 0.9219 0.9205 1toMABB 0.8861 0.9533 0.9185 Table 4: Detailed performance on the different correction types on the training dataset Correction Precision Recall F-score MissingApos 1.0000 0.9841 0.992 MissingW5 0.9737 0.4458 0.6116 1to1 0.9141 0.9127 0.9134 1toMABB 0.8523 0.9699 0.9073 Table 5: Detailed performance on the different correction types on the test dataset placeholder for any token at the given</context>
</contexts>
<marker>Ceylan, Mihalcea, 2011</marker>
<rawString>Hakan Ceylan and Rada Mihalcea. 2011. An efficient indexer for large n-gram corpora. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: Systems Demonstrations, HLT ’11, pages 103– 108, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Leon Derczynski</author>
<author>Alan Ritter</author>
<author>Sam Clark</author>
<author>Kalina Bontcheva</author>
</authors>
<title>Twitter part-of-speech tagging for all: Overcoming sparse and noisy data.</title>
<date>2013</date>
<booktitle>In Proceedings of the International Conference on Recent Advances in Natural Language Processing. Association for Computational Linguistics.</booktitle>
<contexts>
<context position="2509" citStr="Derczynski et al., 2013" startWordPosition="390" endWordPosition="393">nd abbreviations of words and expressions, as well as the use of creative informal language prevalent, giving rise to a higher number of out-of-vocabulary (OOV) words than in other genres. 2 Related Work The informal language of social media, including Twitter, is extremely heterogeneous, making its grammatical analysis more difficult compared to standard genres such as newswire. It has been shown previously, that the performance of linguistic analyzers trained on standard text types degrade severely once they are applied to texts found in social media, especially tweets (Ritter et al., 2011; Derczynski et al., 2013). In order to build taggers that perform more reliable on social media texts, one possible way is to augment the training data by including texts originating from social media (Derczynski et al., 2013). Such approaches, however, require considerable human effort, so one possible alternative can be to normalize the social media texts first, then apply standard analyzers on these normalized texts. Recently, a number of approaches have been proposed for the lexical normalization of informal (mostly social media and SMS) texts (Liu et al., 2011; Liu et al., 2012; Han et al., 2013; Yang and Eisenst</context>
</contexts>
<marker>Derczynski, Ritter, Clark, Bontcheva, 2013</marker>
<rawString>Leon Derczynski, Alan Ritter, Sam Clark, and Kalina Bontcheva. 2013. Twitter part-of-speech tagging for all: Overcoming sparse and noisy data. In Proceedings of the International Conference on Recent Advances in Natural Language Processing. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="false">
<authors>
<author>Kevin Gimpel</author>
<author>Nathan Schneider</author>
<author>Brendan O’Connor</author>
<author>Dipanjan Das</author>
<author>Daniel Mills</author>
<author>Jacob Eisenstein</author>
<author>Michael Heilman</author>
<author>Dani Yogatama</author>
<author>Jeffrey Flanigan</author>
<author>Noah A Smith</author>
</authors>
<title>Part-of-speech tagging for twitter: Annotation, features, and experiments.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: Short Papers - Volume 2, HLT ’11,</booktitle>
<pages>42--47</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA.</location>
<marker>Gimpel, Schneider, O’Connor, Das, Mills, Eisenstein, Heilman, Yogatama, Flanigan, Smith, 2011</marker>
<rawString>Kevin Gimpel, Nathan Schneider, Brendan O’Connor, Dipanjan Das, Daniel Mills, Jacob Eisenstein, Michael Heilman, Dani Yogatama, Jeffrey Flanigan, and Noah A. Smith. 2011. Part-of-speech tagging for twitter: Annotation, features, and experiments. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: Short Papers - Volume 2, HLT ’11, pages 42–47, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalisation of short text messages: Makn sens a #twitter.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11,</booktitle>
<pages>368--378</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="3143" citStr="Han and Baldwin (2011)" startWordPosition="493" endWordPosition="496">o build taggers that perform more reliable on social media texts, one possible way is to augment the training data by including texts originating from social media (Derczynski et al., 2013). Such approaches, however, require considerable human effort, so one possible alternative can be to normalize the social media texts first, then apply standard analyzers on these normalized texts. Recently, a number of approaches have been proposed for the lexical normalization of informal (mostly social media and SMS) texts (Liu et al., 2011; Liu et al., 2012; Han et al., 2013; Yang and Eisenstein, 2013). Han and Baldwin (2011) rely on the identification of the words that require correction, then define a confusion set containing the candidate IV correction forms for such words. Finally, a ranking scheme, taking multiple factors into consideration, is applied which selects the most likely correction for an OOV word. In their subsequent work, Han et al. (2012) propose an automated method to construct accurate normalization dictionaries. Liu et al. (2011; 2012) propose a character-level sequence model to predict insertions, deletions and substitutions. They first collect a large set of noisy (OOV, IV) training pairs f</context>
</contexts>
<marker>Han, Baldwin, 2011</marker>
<rawString>Bo Han and Timothy Baldwin. 2011. Lexical normalisation of short text messages: Makn sens a #twitter. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11, pages 368–378, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Paul Cook</author>
<author>Timothy Baldwin</author>
</authors>
<title>Automatically constructing a normalisation dictionary for microblogs.</title>
<date>2012</date>
<booktitle>In Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning</booktitle>
<pages>421--432</pages>
<location>Jeju Island,</location>
<contexts>
<context position="3481" citStr="Han et al. (2012)" startWordPosition="550" endWordPosition="553">andard analyzers on these normalized texts. Recently, a number of approaches have been proposed for the lexical normalization of informal (mostly social media and SMS) texts (Liu et al., 2011; Liu et al., 2012; Han et al., 2013; Yang and Eisenstein, 2013). Han and Baldwin (2011) rely on the identification of the words that require correction, then define a confusion set containing the candidate IV correction forms for such words. Finally, a ranking scheme, taking multiple factors into consideration, is applied which selects the most likely correction for an OOV word. In their subsequent work, Han et al. (2012) propose an automated method to construct accurate normalization dictionaries. Liu et al. (2011; 2012) propose a character-level sequence model to predict insertions, deletions and substitutions. They first collect a large set of noisy (OOV, IV) training pairs from the Web. These pairs are then aligned at the character level 120 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 120–125, Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics and provided as training data for a CRF classifier. The authors also released their 3,802-element normaliza</context>
<context position="8370" citStr="Han et al. (2012)" startWordPosition="1363" endWordPosition="1366">ntly, we describe the details of these modules. 4.1 Sequence Labeling for Determining Correction Types As already mentioned in Section 3, the first component in our pipeline was a linear chain CRF 121 (Lafferty et al., 2001). Besides the common word surface forms, such as the capitalization pattern, the first letter or character suffixes, we relied on the following dictionary resources upon determining the features for the individual words: • the SCOWL dictionary being part of the aspell spell checker project containing canonical English dictionary entries, • the normalization dictionaries of Han et al. (2012) and Liu et al. (2012), • the 5,307-element normalization dictionary derived from the portal noslang.com, which map common social media abbreviations to their complete forms. For each token, word type features were generated along with the word types of its neighboring tokens. The POS tags assigned to each token and its neighboring tokens by the Twitter POS tagger (Gimpel et al., 2011) were also utilized as features in the CRF model. The Twitter POS tag set was useful to us, as it contains a separate tag (G) for multi-word abbreviations (e.g. ily for I love you), which was expected to be highl</context>
</contexts>
<marker>Han, Cook, Baldwin, 2012</marker>
<rawString>Bo Han, Paul Cook, and Timothy Baldwin. 2012. Automatically constructing a normalisation dictionary for microblogs. In Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CoNLL 2012), pages 421– 432, Jeju Island, Korea.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Paul Cook</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalization for social media text.</title>
<date>2013</date>
<journal>ACM Trans. Intell. Syst. Technol.,</journal>
<volume>4</volume>
<issue>1</issue>
<contexts>
<context position="3091" citStr="Han et al., 2013" startWordPosition="485" endWordPosition="488">l., 2011; Derczynski et al., 2013). In order to build taggers that perform more reliable on social media texts, one possible way is to augment the training data by including texts originating from social media (Derczynski et al., 2013). Such approaches, however, require considerable human effort, so one possible alternative can be to normalize the social media texts first, then apply standard analyzers on these normalized texts. Recently, a number of approaches have been proposed for the lexical normalization of informal (mostly social media and SMS) texts (Liu et al., 2011; Liu et al., 2012; Han et al., 2013; Yang and Eisenstein, 2013). Han and Baldwin (2011) rely on the identification of the words that require correction, then define a confusion set containing the candidate IV correction forms for such words. Finally, a ranking scheme, taking multiple factors into consideration, is applied which selects the most likely correction for an OOV word. In their subsequent work, Han et al. (2012) propose an automated method to construct accurate normalization dictionaries. Liu et al. (2011; 2012) propose a character-level sequence model to predict insertions, deletions and substitutions. They first col</context>
</contexts>
<marker>Han, Cook, Baldwin, 2013</marker>
<rawString>Bo Han, Paul Cook, and Timothy Baldwin. 2013. Lexical normalization for social media text. ACM Trans. Intell. Syst. Technol., 4(1):5:1–5:27, February.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Amaç Herdağdelen</author>
</authors>
<title>Twitter n-gram corpus with demographic metadata.</title>
<date>2013</date>
<journal>Language Resources and Evaluation,</journal>
<volume>47</volume>
<issue>4</issue>
<contexts>
<context position="13588" citStr="Herdadelen, 2013" startWordPosition="2262" endWordPosition="2263">e of our pipeline, we were more interested in having the correct IV word in the set of candidate normalization, rather than reducing its size. 4.3 Making Use of Twitter n-gram Statistics Our basic assumption was that from the context of an OOV word, it is possible to reconstruct its IV equivalent, as there are users who use the correct IV English form of the OOV word within the same context, e.g. see you tomorrow instead of see u tmrw. The Twitter n-gram frequencies we made use of were the ones that we aggregated over the Twitter n-gram corpus augmented with demographic metadata described in (Herdadelen, 2013). For a given token ti at position i in a tweet, we chose the most probable corrected form according to the formula $\arg\max_{t&apos; \in C(t_i, ct(t_i))} P(t&apos; \mid t_{i-1})\, P(t_{i+1} \mid t&apos;)$, (1) where the function C(ti, ct(ti)) returns a set of IV candidates for the token ti, according to ct(ti), which is the correction type determined for that token by the sequence model introduced in Section 4.1. We indexed the Twitter n-gram corpus with the highly effective LIT indexer (Ceylan and Mihalcea, 2011), which made fast queries of the form ti−1 * ti+1 possible, the symbol * being a Correction Precision Recall F-score MissingA</context>
</contexts>
<marker>Herdadelen, 2013</marker>
<rawString>Ama Herdadelen. 2013. Twitter n-gram corpus with demographic metadata. Language Resources and Evaluation, 47(4):1127–1147.</rawString>
</citation>
<citation valid="true">
<authors>
<author>John D Lafferty</author>
<author>Andrew McCallum</author>
<author>Fernando C N Pereira</author>
</authors>
<title>Conditional random fields: Probabilistic models for segmenting and labeling sequence data.</title>
<date>2001</date>
<booktitle>In Proceedings of the Eighteenth International Conference on Machine Learning, ICML ’01,</booktitle>
<pages>282--289</pages>
<publisher>Morgan Kaufmann Publishers Inc.</publisher>
<location>San Francisco, CA, USA.</location>
<contexts>
<context position="7977" citStr="Lafferty et al., 2001" startWordPosition="1304" endWordPosition="1307">uired to be performed for a token. As mentioned above, more than 90% of the words in both the training and test sets belonged to this category. Table 1 shows the distribution of the correction types on both the training and test sets. 4 Proposed Approach Our approach consists of a sequence labeling module and relies on lookups from an efficiently indexed n-gram corpus of English tweets. Subsequently, we describe the details of these modules. 4.1 Sequence Labeling for Determining Correction Types As already mentioned in Section 3, the first component in our pipeline was a linear chain CRF 121 (Lafferty et al., 2001). Besides the common word surface forms, such as the capitalization pattern, the first letter or character suffixes, we relied on the following dictionary resources upon determining the features for the individual words: • the SCOWL dictionary being part of the aspell spell checker project containing canonical English dictionary entries, • the normalization dictionaries of Han et al. (2012) and Liu et al. (2012), • the 5,307-element normalization dictionary derived from the portal noslang.com, which map common social media abbreviations to their complete forms. For each token, word type featur</context>
</contexts>
<marker>Lafferty, McCallum, Pereira, 2001</marker>
<rawString>John D. Lafferty, Andrew McCallum, and Fernando C. N. Pereira. 2001. Conditional random fields: Probabilistic models for segmenting and labeling sequence data. In Proceedings of the Eighteenth International Conference on Machine Learning, ICML ’01, pages 282–289, San Francisco, CA, USA. Morgan Kaufmann Publishers Inc.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alex Lamb</author>
<author>Michael J Paul</author>
<author>Mark Dredze</author>
</authors>
<title>Separating fact from fear: Tracking flu infections on twitter.</title>
<date>2013</date>
<booktitle>In NAACL.</booktitle>
<contexts>
<context position="1449" citStr="Lamb et al., 2013" startWordPosition="222" endWordPosition="225">rom the context of the OOV words, it is possible to reconstruct its IV equivalent, as there are users who use the standard English form of the OOV word within the same context. Our approach achieved an F-score of 0.8052, being the second best one among the unconstrained submissions, the category our submission also belongs to. 1 Introduction Social media is a rich source of information which has been proven to be useful to a variety of applications, such as event extraction (Sakaki et al., 2010; Ritter et al., 2012; Ritter et al., 2015) or trend detection, including the tracking of epidemics (Lamb et al., 2013). Analyzing tweets in general, however, can pose several difficulties. From an engineering point of view, the streaming nature of tweets requires that special attention is paid to the scalability of the algorithms applied and from an NLP point of view, the often substandard characteristics of social media utterances has to be addressed. The fact that tweets are often written on mobile devices and are informal makes the misspelling and abbreviations of words and expressions, as well as the use of creative informal language prevalent, giving rise to a higher number of out-of-vocabulary (OOV) wor</context>
</contexts>
<marker>Lamb, Paul, Dredze, 2013</marker>
<rawString>Alex Lamb, Michael J. Paul, and Mark Dredze. 2013. Separating fact from fear: Tracking flu infections on twitter. In In NAACL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Fei Liu</author>
<author>Fuliang Weng</author>
<author>Bingqing Wang</author>
<author>Yang Liu</author>
</authors>
<title>Insertion, Deletion, or Substitution? Normalizing Text Messages without Pre-categorization nor Supervision.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies,</booktitle>
<pages>71--76</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="3055" citStr="Liu et al., 2011" startWordPosition="477" endWordPosition="480">edia, especially tweets (Ritter et al., 2011; Derczynski et al., 2013). In order to build taggers that perform more reliable on social media texts, one possible way is to augment the training data by including texts originating from social media (Derczynski et al., 2013). Such approaches, however, require considerable human effort, so one possible alternative can be to normalize the social media texts first, then apply standard analyzers on these normalized texts. Recently, a number of approaches have been proposed for the lexical normalization of informal (mostly social media and SMS) texts (Liu et al., 2011; Liu et al., 2012; Han et al., 2013; Yang and Eisenstein, 2013). Han and Baldwin (2011) rely on the identification of the words that require correction, then define a confusion set containing the candidate IV correction forms for such words. Finally, a ranking scheme, taking multiple factors into consideration, is applied which selects the most likely correction for an OOV word. In their subsequent work, Han et al. (2012) propose an automated method to construct accurate normalization dictionaries. Liu et al. (2011; 2012) propose a character-level sequence model to predict insertions, deletio</context>
</contexts>
<marker>Liu, Weng, Wang, Liu, 2011</marker>
<rawString>Fei Liu, Fuliang Weng, Bingqing Wang, and Yang Liu. 2011. Insertion, Deletion, or Substitution? Normalizing Text Messages without Pre-categorization nor Supervision. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pages 71– 76, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Fei Liu</author>
<author>Fuliang Weng</author>
<author>Xiao Jiang</author>
</authors>
<title>A Broad-Coverage Normalization System for Social Media Language.</title>
<date>2012</date>
<booktitle>Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Long Papers-Volume 1,</booktitle>
<pages>1035--1044</pages>
<contexts>
<context position="3073" citStr="Liu et al., 2012" startWordPosition="481" endWordPosition="484">weets (Ritter et al., 2011; Derczynski et al., 2013). In order to build taggers that perform more reliable on social media texts, one possible way is to augment the training data by including texts originating from social media (Derczynski et al., 2013). Such approaches, however, require considerable human effort, so one possible alternative can be to normalize the social media texts first, then apply standard analyzers on these normalized texts. Recently, a number of approaches have been proposed for the lexical normalization of informal (mostly social media and SMS) texts (Liu et al., 2011; Liu et al., 2012; Han et al., 2013; Yang and Eisenstein, 2013). Han and Baldwin (2011) rely on the identification of the words that require correction, then define a confusion set containing the candidate IV correction forms for such words. Finally, a ranking scheme, taking multiple factors into consideration, is applied which selects the most likely correction for an OOV word. In their subsequent work, Han et al. (2012) propose an automated method to construct accurate normalization dictionaries. Liu et al. (2011; 2012) propose a character-level sequence model to predict insertions, deletions and substitutio</context>
<context position="8392" citStr="Liu et al. (2012)" startWordPosition="1368" endWordPosition="1371">details of these modules. 4.1 Sequence Labeling for Determining Correction Types As already mentioned in Section 3, the first component in our pipeline was a linear chain CRF 121 (Lafferty et al., 2001). Besides the common word surface forms, such as the capitalization pattern, the first letter or character suffixes, we relied on the following dictionary resources upon determining the features for the individual words: • the SCOWL dictionary being part of the aspell spell checker project containing canonical English dictionary entries, • the normalization dictionaries of Han et al. (2012) and Liu et al. (2012), • the 5,307-element normalization dictionary derived from the portal noslang.com, which map common social media abbreviations to their complete forms. For each token, word type features were generated along with the word types of its neighboring tokens. The POS tags assigned to each token and its neighboring tokens by the Twitter POS tagger (Gimpel et al., 2011) were also utilized as features in the CRF model. The Twitter POS tag set was useful to us, as it contains a separate tag (G) for multi-word abbreviations (e.g. ily for I love you), which was expected to be highly indicative for the c</context>
</contexts>
<marker>Liu, Weng, Jiang, 2012</marker>
<rawString>Fei Liu, Fuliang Weng, and Xiao Jiang. 2012. A Broad-Coverage Normalization System for Social Media Language. Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Long Papers-Volume 1, pages 1035–1044.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Naoaki Okazaki</author>
</authors>
<title>Crfsuite: a fast implementation of conditional random fields (crfs).</title>
<date>2007</date>
<contexts>
<context position="6024" citStr="Okazaki, 2007" startWordPosition="980" endWordPosition="981"> in the paper (Baldwin et al., 2015). As a consequence, we first built a sequence model to decide which tokens need to be corrected and in what way. A typical distinction of the correction types would be based on the number of tokens a noisy token and its corrected form comprises of. According to this approach, one could distinguish between one-to-one, one-to-many and many-to-one corrections on the per token basis. However, instead of applying the above types of corrections, we identified a more detailed categorization of the correction types and trained a linear chain CRF utilizing CRFsuite (Okazaki, 2007). The correction types a token could be classified as were the following: • MissingApos, standing for tokens that only differ from their corrected version in the absence of an apostrophe (e.g. youll → you’ll), • MissingWS, standing for tokens that only Training Test MissingApos 507 369 MissingWS 126 76 1to1ED≤2 1,979 1,405 1to1ED&gt;3 413 292 1toMABB 917 634 Subtotal 3,942 2,776 O 40,443 26,645 Total 44,385 29,421 Table 1: Distribution of the correction types in the training and test sets differ from their corrected version in the absence of one or more whitespace characters (e.g. whataburger → w</context>
<context position="10024" citStr="Okazaki, 2007" startWordPosition="1651" endWordPosition="1652"> function split(t) returns the set of all the possible splits of token t such that its components are all contained in the SCOWL dictionary. For instance split(“whataburger&amp;quot;) returns a set of splits including “what a burger”, “what a burg er” and “what ab urger”. As there is a split (i.e. “what a burger”) that is sufficiently frequent according to the n-gram corpus, we take it as an indication that the original token omitted some whitespace characters that we need to inserted. A CRF model with the above feature set was trained using L-BFGS training method and L1 regularization using CRFsuite (Okazaki, 2007). The overall token accuracy this model achieved was Precision Recall F-score MissingApos 0.9686 0.9744 0.9715 MissingW5 0.8795 0.5794 0.6986 1to1ED&lt;2 0.9078 0.8504 0.8782 1to1ED&gt;3 0.9593 0.6852 0.7994 1toMABB 0.9624 0.8942 0.9271 O 0.9874 0.9959 0.9916 macro average 0.9442 0.8299 0.8777 Table 2: Results of predicting the correction types for tokens on the training set Precision Recall F-score MissingApos 0.9755 0.9702 0.9728 MissingW5 0.7674 0.4342 0.5546 1to1ED&lt;2 0.8619 0.7950 0.8271 1to1ED&gt;3 0.8793 0.5240 0.6567 1toMABB 0.9449 0.8659 0.9037 O 0.9816 0.9932 0.9874 macro average 0.9018 0.7638</context>
</contexts>
<marker>Okazaki, 2007</marker>
<rawString>Naoaki Okazaki. 2007. Crfsuite: a fast implementation of conditional random fields (crfs).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Sam Clark</author>
<author>Mausam</author>
<author>Oren Etzioni</author>
</authors>
<title>Named entity recognition in tweets: An experimental study.</title>
<date>2011</date>
<booktitle>In Proceedings of the Conference on Empirical Methods in Natural Language Processing, EMNLP ’11,</booktitle>
<pages>1524--1534</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="2483" citStr="Ritter et al., 2011" startWordPosition="386" endWordPosition="389">kes the misspelling and abbreviations of words and expressions, as well as the use of creative informal language prevalent, giving rise to a higher number of out-of-vocabulary (OOV) words than in other genres. 2 Related Work The informal language of social media, including Twitter, is extremely heterogeneous, making its grammatical analysis more difficult compared to standard genres such as newswire. It has been shown previously, that the performance of linguistic analyzers trained on standard text types degrade severely once they are applied to texts found in social media, especially tweets (Ritter et al., 2011; Derczynski et al., 2013). In order to build taggers that perform more reliable on social media texts, one possible way is to augment the training data by including texts originating from social media (Derczynski et al., 2013). Such approaches, however, require considerable human effort, so one possible alternative can be to normalize the social media texts first, then apply standard analyzers on these normalized texts. Recently, a number of approaches have been proposed for the lexical normalization of informal (mostly social media and SMS) texts (Liu et al., 2011; Liu et al., 2012; Han et a</context>
</contexts>
<marker>Ritter, Clark, Mausam, Etzioni, 2011</marker>
<rawString>Alan Ritter, Sam Clark, Mausam, and Oren Etzioni. 2011. Named entity recognition in tweets: An experimental study. In Proceedings of the Conference on Empirical Methods in Natural Language Processing, EMNLP ’11, pages 1524–1534, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Mausam</author>
<author>Oren Etzioni</author>
<author>Sam Clark</author>
</authors>
<title>Open domain event extraction from twitter.</title>
<date>2012</date>
<booktitle>In Proceedings of the 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD ’12,</booktitle>
<pages>1104--1112</pages>
<publisher>ACM.</publisher>
<location>New York, NY, USA.</location>
<contexts>
<context position="1351" citStr="Ritter et al., 2012" startWordPosition="205" endWordPosition="208">ently indexed n-gram statistics from English tweets. Our solution is based on the assumption that from the context of the OOV words, it is possible to reconstruct its IV equivalent, as there are users who use the standard English form of the OOV word within the same context. Our approach achieved an F-score of 0.8052, being the second best one among the unconstrained submissions, the category our submission also belongs to. 1 Introduction Social media is a rich source of information which has been proven to be useful to a variety of applications, such as event extraction (Sakaki et al., 2010; Ritter et al., 2012; Ritter et al., 2015) or trend detection, including the tracking of epidemics (Lamb et al., 2013). Analyzing tweets in general, however, can pose several difficulties. From an engineering point of view, the streaming nature of tweets requires that special attention is paid to the scalability of the algorithms applied and from an NLP point of view, the often substandard characteristics of social media utterances has to be addressed. The fact that tweets are often written on mobile devices and are informal makes the misspelling and abbreviations of words and expressions, as well as the use of c</context>
</contexts>
<marker>Ritter, Mausam, Clark, 2012</marker>
<rawString>Alan Ritter, Mausam, Oren Etzioni, and Sam Clark. 2012. Open domain event extraction from twitter. In Proceedings of the 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD ’12, pages 1104–1112, New York, NY, USA. ACM.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Evan Wright</author>
<author>William Casey</author>
<author>Tom Mitchell</author>
</authors>
<title>Weakly supervised extraction of computer security events from twitter.</title>
<date>2015</date>
<booktitle>In Proceedings of the 24th International Conference on World Wide Web, WWW ’15,</booktitle>
<pages>896--905</pages>
<publisher>International World Wide Web Conferences Steering Committee.</publisher>
<location>Republic and Canton of Geneva, Switzerland.</location>
<contexts>
<context position="1373" citStr="Ritter et al., 2015" startWordPosition="209" endWordPosition="212">statistics from English tweets. Our solution is based on the assumption that from the context of the OOV words, it is possible to reconstruct its IV equivalent, as there are users who use the standard English form of the OOV word within the same context. Our approach achieved an F-score of 0.8052, being the second best one among the unconstrained submissions, the category our submission also belongs to. 1 Introduction Social media is a rich source of information which has been proven to be useful to a variety of applications, such as event extraction (Sakaki et al., 2010; Ritter et al., 2012; Ritter et al., 2015) or trend detection, including the tracking of epidemics (Lamb et al., 2013). Analyzing tweets in general, however, can pose several difficulties. From an engineering point of view, the streaming nature of tweets requires that special attention is paid to the scalability of the algorithms applied and from an NLP point of view, the often substandard characteristics of social media utterances has to be addressed. The fact that tweets are often written on mobile devices and are informal makes the misspelling and abbreviations of words and expressions, as well as the use of creative informal langu</context>
</contexts>
<marker>Ritter, Wright, Casey, Mitchell, 2015</marker>
<rawString>Alan Ritter, Evan Wright, William Casey, and Tom Mitchell. 2015. Weakly supervised extraction of computer security events from twitter. In Proceedings of the 24th International Conference on World Wide Web, WWW ’15, pages 896–905, Republic and Canton of Geneva, Switzerland. International World Wide Web Conferences Steering Committee.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Takeshi Sakaki</author>
<author>Makoto Okazaki</author>
<author>Yutaka Matsuo</author>
</authors>
<title>Earthquake shakes twitter users: Real-time event detection by social sensors.</title>
<date>2010</date>
<booktitle>In Proceedings of the 19th International Conference on World Wide Web, WWW ’10,</booktitle>
<pages>851--860</pages>
<publisher>ACM.</publisher>
<location>New York, NY, USA.</location>
<contexts>
<context position="1330" citStr="Sakaki et al., 2010" startWordPosition="201" endWordPosition="204"> collection of efficiently indexed n-gram statistics from English tweets. Our solution is based on the assumption that from the context of the OOV words, it is possible to reconstruct its IV equivalent, as there are users who use the standard English form of the OOV word within the same context. Our approach achieved an F-score of 0.8052, being the second best one among the unconstrained submissions, the category our submission also belongs to. 1 Introduction Social media is a rich source of information which has been proven to be useful to a variety of applications, such as event extraction (Sakaki et al., 2010; Ritter et al., 2012; Ritter et al., 2015) or trend detection, including the tracking of epidemics (Lamb et al., 2013). Analyzing tweets in general, however, can pose several difficulties. From an engineering point of view, the streaming nature of tweets requires that special attention is paid to the scalability of the algorithms applied and from an NLP point of view, the often substandard characteristics of social media utterances has to be addressed. The fact that tweets are often written on mobile devices and are informal makes the misspelling and abbreviations of words and expressions, as</context>
</contexts>
<marker>Sakaki, Okazaki, Matsuo, 2010</marker>
<rawString>Takeshi Sakaki, Makoto Okazaki, and Yutaka Matsuo. 2010. Earthquake shakes twitter users: Real-time event detection by social sensors. In Proceedings of the 19th International Conference on World Wide Web, WWW ’10, pages 851–860, New York, NY, USA. ACM.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Yi Yang</author>
<author>Jacob Eisenstein</author>
</authors>
<title>A Log-Linear Model for Unsupervised Text Normalization.</title>
<date>2013</date>
<booktitle>Proceedings of the Empirical Methods on Natural Language Processing (EMNLP),</booktitle>
<pages>61--72</pages>
<contexts>
<context position="3119" citStr="Yang and Eisenstein, 2013" startWordPosition="489" endWordPosition="492">ki et al., 2013). In order to build taggers that perform more reliable on social media texts, one possible way is to augment the training data by including texts originating from social media (Derczynski et al., 2013). Such approaches, however, require considerable human effort, so one possible alternative can be to normalize the social media texts first, then apply standard analyzers on these normalized texts. Recently, a number of approaches have been proposed for the lexical normalization of informal (mostly social media and SMS) texts (Liu et al., 2011; Liu et al., 2012; Han et al., 2013; Yang and Eisenstein, 2013). Han and Baldwin (2011) rely on the identification of the words that require correction, then define a confusion set containing the candidate IV correction forms for such words. Finally, a ranking scheme, taking multiple factors into consideration, is applied which selects the most likely correction for an OOV word. In their subsequent work, Han et al. (2012) propose an automated method to construct accurate normalization dictionaries. Liu et al. (2011; 2012) propose a character-level sequence model to predict insertions, deletions and substitutions. They first collect a large set of noisy (O</context>
</contexts>
<marker>Yang, Eisenstein, 2013</marker>
<rawString>Yi Yang and Jacob Eisenstein. 2013. A Log-Linear Model for Unsupervised Text Normalization. Proceedings of the Empirical Methods on Natural Language Processing (EMNLP), pages 61–72.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>