<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.021697">
<title confidence="0.96823">
Bekli: A Simple Approach to Twitter Text Normalization
</title>
<author confidence="0.903396">
Russell Beckley
</author>
<affiliation confidence="0.818608">
Oregon Health and Sciences University
</affiliation>
<address confidence="0.713368">
Portland, Oregon
</address>
<email confidence="0.99827">
beckleyr@ohsu.edu
</email>
<sectionHeader confidence="0.99389" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999567166666667">
Every day, Twitter users generate vast quan-
tities of potentially useful information in the
form of written language. Due to Twitter’s fre-
quently informal tone, text normalization can
be a crucial element for exploiting that infor-
mation. This paper outlines our approach to
text normalization used in the WNUT shared
task. We show that a very simple solu-
tion, powered by a modestly sized, partially-
curated wordlist—combined with a modest re-
ranking scheme—can deliver respectable re-
sults.
</bodyText>
<sectionHeader confidence="0.998801" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999151473684211">
Twitter is an immense, living collection of written
language from all over the world. Every day, Twit-
ter publishes a staggering 500 million tweets1. The
content of Twitter is virtually unlimited, and has
proven useful for much research, including epidemi-
ology: Chew and Eysenbach (2010); and sentiment
analysis: Barbosa and Feng (2010), Bakliwal et al.
(2013), Rosenthal et al. (2015), Li et al. (2014).
It would take many readers to keep up with Twit-
ter’s output, but, fortunately, we have natural lan-
guage processing (NLP) methods that can automat-
ically filter, condense, or extract information from
text. However, NLP approaches are typically trained
on formal edited text, and struggle with the infor-
mal, unedited text of Twitter. But there is a well-
known way to mitigate this problem: text normal-
ization, i.e. replacing non-standard tokens with their
standard equivalents, yielding text that will be more
agreeable to NLP.
</bodyText>
<footnote confidence="0.5707685">
1https://blog.twitter.com/2013/new-tweets-per-second-
record-and-how
</footnote>
<bodyText confidence="0.999846571428572">
One flavor of non-standard writing—what I have
previously focused on—is what I call “vernacular
orthography” (VO). VO is spelling that indicates
intentional non-standard pronunciation, such as when
the string “dat” stands in for “that”. While nu-
merous papers offer solutions for text normalization
(e.g. Han and Baldwin (2011), Yang and Eisenstein
(2013), Zhang et al. (2013), Sproat et al. (2001),
Li and Liu (2014)), and a few build models based
on phonemic similarity (e.g. Kobus et al. (2008),
Choudhury et al. (2007)), none to our knowledge
have addressed VO in particular. This paper, too, ad-
dresses the general normalization problem, but uses
lessons learned attempting to normalize VO.
</bodyText>
<sectionHeader confidence="0.930925" genericHeader="method">
2 System Architecture
</sectionHeader>
<bodyText confidence="0.999805">
The architecture of this system is very simple, con-
sisting of three main parts: (1) a substitution list, (2)
a couple of rule based components, and (3) a sen-
tence level re-ranker. This provides for a fast per-
token performance.
</bodyText>
<subsectionHeader confidence="0.998392">
2.1 Substitution List
</subsectionHeader>
<bodyText confidence="0.999957153846154">
Most of the work is done by a semi-supervised sub-
stitution list consisting of ordered pairs. The first
member of each pair is a string representing a non-
standard word. The second member of each pair is
a list of strings representing candidate replacements
for the word. For example, one pair is (“n”, (“and,
in”)). There are just over 45,000 pairs, where only
the first 2000 are hand-curated.
To create the list, we use a collection of tweets
(see “Resources Employed” section) and a derived
dictionary, described presently. The dictionary has
the 18,000 standard words most frequent in the tweet
collection. For the construction of the dictionary, a
</bodyText>
<page confidence="0.992964">
82
</page>
<note confidence="0.789276">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 82–86,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.999878695652174">
word is considered to be standard if it has at least
four characters, and is found in the CMU pronounc-
ing dictionary, or, if it has fewer than three charac-
ters, it is found in the Norvig dictionary2 and is suffi-
ciently frequent (where sufficient frequency depends
on the word length).
Now we want to find the most frequent OOVs that
need to be normalized. We tokenize the twitter set
and filter out all tokens that appear in our dictionary.
We also filter out all tokens that do not match the
format of normalizable tokens as specified by the
shared task e.g. tokens that have non-alphanumeric
characters other than an apostrophe(’). Lastly, we
filter out those tokens that could be normalized by
our rule-based components (described in next sub-
section).
We count the occurrences of each OOV-candidate
token type, sort by the count, and return the resulting
list. This puts the most useful candidates first and
provides for efficient use of annotation time. Suit-
able replacements require human judgement and oc-
casional reference to outside sources. The outside
sources were (1) Urban Dictionary, which is very
useful for slang and acronyms (2) Twitter, which
tells you how a word is most often used on Twit-
ter, and (3) the training set provided for the shared
task, which tells you how to normalize in ambigu-
ous cases (e.g. “laughing out loud” v. “laugh out
loud”).2
In addition to these hand-curated entries, we
added the Lexical normalization dictionaries,
UniMelb and UTDallas, provided for the shared
task. From these lists we took all entries not already
in the hand-curated list.2
With this initial list in place, we ran it on the train-
ing set and analyzed the errors, looking specifically
at false positives and false negatives. We sorted the
tokens that caused these errors according to a for-
mula that estimated what change would occur to the
f-score if the token was to be removed or added to
the list. If the token represented a false negative, we
would estimate the change to f-score we would get
by adding it to the list, assuming that its substitu-
tion would always be the word most often associated
with it in the training set. If it was a false positive, we
would estimate the change to f-score we would get
</bodyText>
<subsectionHeader confidence="0.44046">
2See “Resources Employed” section.
</subsectionHeader>
<bodyText confidence="0.999807172413793">
by deleting it from the list.
This analysis revealed some weak spots in the list.
First, there were a number of false positives caused
by differing beliefs regarding what counts as non-
standard. For example, there are several contractions
(e.g. “gonna”, “gotta”, “wanna”, and “ain’t”) that
are not usually considered standard (rarely seen in
The Wall Street Journal), and have straightforward
normalizations, that are nonetheless considered to
be in-vocabulary in the task. These words were re-
moved from the substitution list, and added to a new
list—a “do-not-normalize” list.
Furthermore, there were of course a number of
false negatives. Many of these come from tokens
that are in the dictionary, but that are often used in a
non-standard way in informal speech. For example,
“wit” is in standard dictionaries, referring to an in-
tellectual feature; however it often appears in Twit-
ter as a non-standard variation of “with”, as in “you
wit me hea?” Likewise on Twitter, “cause” almost
always means “because”. Such tokens were added
to the substitution list.
It might be supposed that using the training set in
this way could lead to severe over-fitting. To avoid
this, we didn’t make any adjustments for tokens ap-
pearing less than three times as a false positive, true
positive, or false negative. The results show that any
over-fitting was not severe, since the test f-score was
just one point less than the training f-score.
</bodyText>
<subsectionHeader confidence="0.989832">
2.2 Rule-based components
</subsectionHeader>
<bodyText confidence="0.999901352941177">
We also experimented with several rule-based com-
ponents, two of which— because they applied in the
greatest number of cases in the training set—were
used in the final system. These components were
the “ing” rule and the “coool” rule.
The “ing” rule looks for cases in which the ver-
bal suffix “-ing” is altered to an “-in”, “-en”, or “-n”,
such as when “busting” becomes “bustin”. If the test
token is in the dictionary, the component generates
no candidates. If the token is not in the dictionary,
the component checks if the word ends with “-in”,
“-en”, or “-n” preceded by certain consonants, and
if so, checks for the likelihood of additional sylla-
bles. If those conditions hold, it replaces the iden-
tified ending with “ing”, and if the result is in the
dictionary, it becomes a candidate.
The “coool” rule attempts to normalize text that,
</bodyText>
<page confidence="0.994706">
83
</page>
<bodyText confidence="0.999987117647059">
for emphasis, repeats characters, as in “Thaatt iss
reallyyyyy neeeeat!” To generate candidates, the
“coool” rule finds every run of more than two re-
peated characters and reduces the length of the run
to two. For every one of these runs, we assume that
the original had either one or two of that character in
that place. We consider every string that can be cre-
ated by reducing a subset of the two-character runs
to one character each, and return only those strings
that occur in the dictionary. For example, if the orig-
inal token is “thaatt”, we consider “thaatt”, “thaat”,
“thatt”, and “that”, but return only “that”, being in
the dictionary.
When the system is run with only the “ing” and
“coool” rules, plus the sentence level re-ranker, we
get a precision of .81 and a recall of .09. However,
when combined with the rest of the system, its con-
tribution is insignificant. It seems that the most fre-
quent instances of these rules are already in the sub-
stitution list, so the rules do not generate enough true
positives to offset their generated false positives.
Along with “ing” and “coool”, we tried a num-
ber of similar rule-based components. For example,
we looked for cases where “th” is replaced by ei-
ther “d”, “f”, or “t”. Another example is the “double
consonant” rule, based on the idea that when a word
ends with two consonants, and both are voiced or
both are unvoiced, the second consonant is dropped.
For example “wrist” becomes “wris”. These are
widespread phenomena, but not widespread enough
on Twitter for the true positives to outweigh the false
positives. A more sensitive rule or a better sentence-
level re-ranker would be needed to make these com-
ponents beneficial.
</bodyText>
<subsectionHeader confidence="0.998937">
2.3 Sentence Level Reranker
</subsectionHeader>
<bodyText confidence="0.991062942307693">
The third major component of the system is the sen-
tence-level re-ranker. This is, in short, a bigram
Viterbi algorithm.
Bigrams were collected from our set of ten mil-
lion tweets. Bigrams with any out-of-vocabulary to-
kens were ignored. From this set, for each bigram
(t1, t2), we computed prob(t2|t1) with Laplacian
smoothing. These became the transition probabili-
ties in our Viterbi problem.
At test time, we generated candidates for each
token. If the substitution list had an entry for an
original token, all suggested substitutions in that en-
try became candidates. For each of these candi-
dates, c, we initialize a weight, w_c, where w_c =
2 × (rank(c) + 1), and rank(c) refers to c’s po-
sition in the list for the current token’s entry in the
substitution list. If the “ing” rule or the “coool” rule
generated answers, those would also be candidates.
For the “coool” rule, the weight was the number of
deletions required to get the candidate from the orig-
inal token. For the “ing” rule, the weight was, some-
what arbitrarily, 1. Finally, the original token is a
candidate with a weight of 0. These weights were
the emission weights for Viterbi, and were treated as
− log(prob(c)). For each original token in the test
set, we generated on average .04 other candidates.
The system then constructed a lattice from the
tweet and all of its normalization candidates. With
Viterbi dynamic programming it found the maxi-
mum probability path through the lattice. Words in
the maximal path were taken to be the correct word
for the corresponding token.
At the time of the shared task, I compared this ap-
proach to a simpler approach, in which, for any orig-
inal token, the system ignored the context and se-
lected the normalization with the greatest emission
score (emission score as defined above). At first,
the Viterbi method added 10 percentage points f1
over this method. However, after the shared task
was finished, I discovered that the greatest-emission
method had an error. Having fixed that error, and re-
running the system on the training data, I discovered
that this greatest-emission rule, on the training data,
gives better results than the Viterbi system used for
the shared task: for f1 scores, the Viterbi approach
gets a .768 while the greatest-emission score is .816.
Note that, for the Viterbi approach, the test score,
.757, is not much less than the training score. In
summary, (1) the Viterbi approach, as implemented,
is probably not the best, and (2) the overall normal-
ization approach I describe in this paper is probably
better than the shared task results suggest.
</bodyText>
<sectionHeader confidence="0.990863" genericHeader="method">
3 Resources Employed
</sectionHeader>
<bodyText confidence="0.9976446">
The computations for training and testing were done
on a MacBook. Required computational resources
were minimal. Data resources are as follows:
A. CMU pronouncing dictionary: “an open-
source machine-readable pronunciation dictio-
</bodyText>
<page confidence="0.996478">
84
</page>
<table confidence="0.999948333333333">
Team Name precision recall f1
IHS RD 0.8469 0.8083 0.8272
USZEGED 0.8606 0.7564 0.8052
bekli 0.7732 0.7416 0.7571
gigo 0.7593 0.6963 0.7264
lysgroup 0.4592 0.6296 0.531
</table>
<tableCaption confidence="0.999856">
Table 1: Team Results for the unconstrained task.
</tableCaption>
<bodyText confidence="0.985270428571429">
nary for North American English that contains
over 134,000 words and their pronunciations”3
B. count 1w.txt from Peter Norvig: “The 1/3 mil-
lion most frequent words, all lowercase, with
counts.” 4
C. 10 million tweets collected by Steven Bedrick,
of Oregon Health and Sciences University.
</bodyText>
<figure confidence="0.903220333333333">
D. WNUT Lexical normalisation dictionaries5
a. UniMelb
b. UTDallas
E. WNUT training set.6
F. Urban Dictionary: a highly inclusive user-
generated online dictionary.7
</figure>
<sectionHeader confidence="0.997094" genericHeader="evaluation">
4 Results
</sectionHeader>
<bodyText confidence="0.999891">
Table 1 shows the results for the unconstrained task.
The project described in the present work is named
“bekli”. The training set consisted of 2024 tweets
with a total of 3928 tokens that needed to be nor-
malized. The test set consisted of 1967 tweets with
a total of 2738 tokens that needed to be normalized
Baldwin et al. (2015).
</bodyText>
<sectionHeader confidence="0.995189" genericHeader="conclusions">
5 Conclusion
</sectionHeader>
<bodyText confidence="0.996880833333333">
The results show that a simple strategy with minimal
computational resources can go a long way. For ex-
ample, the space required for the list and rule-based
components is negligible. The only element that re-
quires some heavy lifting is the sentence-level re-
ranker with its long list of bigrams.
</bodyText>
<footnote confidence="0.9999316">
3http://www.speech.cs.cmu.edu/cgi-bin/cmudict
4http://norvig.com/ngrams/count 1w.txt
5http://noisy-text.github.io/norm-shared-task.html
6http://noisy-text.github.io/norm-shared-task.html
7http://www.urbandictionary.com/
</footnote>
<bodyText confidence="0.999972066666667">
However, as I described above, selecting the to-
ken with the greatest-emission probability actually
works better than my bigram approach, and requires
far less computation. This leaves the question: what
results could be achieved using a better re-ranker,
one that successfully exploits context? Such a re-
ranker would, among other benefits, make it feasi-
ble to use rules or substitutions that are not, without
using context, capable of high precision.
Another question remaining is how much better
can we do by expanding the curated segment of
the list—if we, for example, double the size? This
would still allow a program to have a very
small computational imprint, while doing nearly the
work of a more sophisticated system.
</bodyText>
<sectionHeader confidence="0.998914" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.9978715">
Akshat Bakliwal, Jennifer Foster, Jennifer van der Puil,
Ron O’Brien, Lamia Tounsi, and Mark Hughes. 2013.
Sentiment analysis of political tweets: Towards an ac-
curate classifier. In Proceedings of the Workshop on
Language Analysis in Social Media, pages 49–58, At-
lanta, Georgia, June. Association for Computational
Linguistics.
Timothy Baldwin, Marie Catherine de Marneffe, Bo Han,
Young-Bum Kim, Alan Ritter, and Wei Xu. 2015.
Shared tasks of the 2015 workshop on noisy user-
generated text: Twitter lexical normalization and
named entity recognition. In Proceedings of the Work-
shop on Noisy User-generated Text (WNUT 2015),
Beijing, China.
Luciano Barbosa and Junlan Feng. 2010. Robust senti-
ment detection on twitter from biased and noisy data.
In Proceedings of the 23rd International Conference
on Computational Linguistics: Posters, pages 36–44.
Cynthia Chew and Gunther Eysenbach. 2010. Pan-
demics in the age of twitter: content analysis of
tweets during the 2009 h1n1 outbreak. PloS one,
5(11):e14118.
Monojit Choudhury, Rahul Saraf, Vijit Jain, Animesh
Mukherjee, Sudeshna Sarkar, and Anupam Basu.
</reference>
<page confidence="0.993113">
85
</page>
<reference confidence="0.999052844444445">
2007. Investigation and modeling of the structure
of texting language. Int. J. Doc. Anal. Recognit.,
10(3):157–174, December.
Bo Han and Timothy Baldwin. 2011. Lexical normal-
isation of short text messages: Makn sens a #twitter.
In Proceedings of the 49th Annual Meeting of the As-
sociation for Computational Linguistics: Human Lan-
guage Technologies - Volume 1, HLT ’11, pages 368–
378, Stroudsburg, PA, USA. Association for Compu-
tational Linguistics.
Catherine Kobus, François Yvon, and Géraldine
Damnati. 2008. Normalizing sms: Are two metaphors
better than one? In Proceedings of the 22nd Inter-
national Conference on Computational Linguistics -
Volume 1, COLING ’08, pages 441–448, Stroudsburg,
PA, USA. Association for Computational Linguistics.
Chen Li and Yang Liu. 2014. Improving text nor-
malization via unsupervised model and discriminative
reranking. In Proceedings of the ACL 2014 Student
Research Workshop, pages 86–93, Baltimore, Mary-
land, USA, June. Association for Computational Lin-
guistics.
Jiwei Li, Alan Ritter, and Eduard H. Hovy. 2014. Weakly
supervised user profile extraction from twitter. In Pro-
ceedings of the 52nd Annual Meeting of the Associ-
ation for Computational Linguistics, ACL 2014, June
22-27, 2014, Baltimore, MD, USA, Volume 1: Long
Papers, pages 165–174.
Sara Rosenthal, Preslav Nakov, Svetlana Kiritchenko,
Saif M Mohammad, Alan Ritter, and Veselin Stoy-
anov. 2015. Semeval-2015 task 10: Sentiment analy-
sis in twitter. In Proceedings of the 9th International
Workshop on Semantic Evaluation, SemEval.
Richard Sproat, Alan W Black, Stanley Chen, Shankar
Kumar, Mari Ostendorf, and Christopher Richards.
2001. Normalization of non-standard words. Com-
puter Speech &amp; Language, 15(3):287–333.
Yi Yang and Jacob Eisenstein. 2013. A log-linear model
for unsupervised text normalization. In EMNLP,
pages 61–72.
Congle Zhang, Tyler Baldwin, Howard Ho, Benny
Kimelfeld, and Yunyao Li. 2013. Adaptive parser-
centric text normalization. In Proceedings of the 49th
Annual Meeting of the Association for Computational
Linguistics (1), pages 1159–1168.
</reference>
<page confidence="0.998511">
86
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.506129">
<title confidence="0.99956">Bekli: A Simple Approach to Twitter Text Normalization</title>
<author confidence="0.950774">Russell</author>
<affiliation confidence="0.995785">Oregon Health and Sciences</affiliation>
<address confidence="0.595335">Portland,</address>
<email confidence="0.999942">beckleyr@ohsu.edu</email>
<abstract confidence="0.991928846153846">Every day, Twitter users generate vast quantities of potentially useful information in the form of written language. Due to Twitter’s frequently informal tone, text normalization can be a crucial element for exploiting that information. This paper outlines our approach to text normalization used in the WNUT shared task. We show that a very simple solution, powered by a modestly sized, partially-curated wordlist—combined with a modest re-ranking scheme—can deliver respectable results.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Akshat Bakliwal</author>
<author>Jennifer Foster</author>
<author>Jennifer van der Puil</author>
<author>Ron O’Brien</author>
<author>Lamia Tounsi</author>
<author>Mark Hughes</author>
</authors>
<title>Sentiment analysis of political tweets: Towards an accurate classifier.</title>
<date>2013</date>
<booktitle>In Proceedings of the Workshop on Language Analysis in Social Media,</booktitle>
<pages>49--58</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Atlanta, Georgia,</location>
<marker>Bakliwal, Foster, van der Puil, O’Brien, Tounsi, Hughes, 2013</marker>
<rawString>Akshat Bakliwal, Jennifer Foster, Jennifer van der Puil, Ron O’Brien, Lamia Tounsi, and Mark Hughes. 2013. Sentiment analysis of political tweets: Towards an accurate classifier. In Proceedings of the Workshop on Language Analysis in Social Media, pages 49–58, Atlanta, Georgia, June. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Timothy Baldwin</author>
<author>Marie Catherine de Marneffe</author>
<author>Bo Han</author>
<author>Young-Bum Kim</author>
<author>Alan Ritter</author>
<author>Wei Xu</author>
</authors>
<title>Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition.</title>
<date>2015</date>
<booktitle>In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015),</booktitle>
<location>Beijing, China.</location>
<marker>Baldwin, de Marneffe, Han, Kim, Ritter, Xu, 2015</marker>
<rawString>Timothy Baldwin, Marie Catherine de Marneffe, Bo Han, Young-Bum Kim, Alan Ritter, and Wei Xu. 2015. Shared tasks of the 2015 workshop on noisy usergenerated text: Twitter lexical normalization and named entity recognition. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015), Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Luciano Barbosa</author>
<author>Junlan Feng</author>
</authors>
<title>Robust sentiment detection on twitter from biased and noisy data.</title>
<date>2010</date>
<booktitle>In Proceedings of the 23rd International Conference on Computational Linguistics: Posters,</booktitle>
<pages>36--44</pages>
<contexts>
<context position="990" citStr="Barbosa and Feng (2010)" startWordPosition="146" endWordPosition="149">ting that information. This paper outlines our approach to text normalization used in the WNUT shared task. We show that a very simple solution, powered by a modestly sized, partiallycurated wordlist—combined with a modest reranking scheme—can deliver respectable results. 1 Introduction Twitter is an immense, living collection of written language from all over the world. Every day, Twitter publishes a staggering 500 million tweets1. The content of Twitter is virtually unlimited, and has proven useful for much research, including epidemiology: Chew and Eysenbach (2010); and sentiment analysis: Barbosa and Feng (2010), Bakliwal et al. (2013), Rosenthal et al. (2015), Li et al. (2014). It would take many readers to keep up with Twitter’s output, but, fortunately, we have natural language processing (NLP) methods that can automatically filter, condense, or extract information from text. However, NLP approaches are typically trained on formal edited text, and struggle with the informal, unedited text of Twitter. But there is a wellknown way to mitigate this problem: text normalization, i.e. replacing non-standard tokens with their standard equivalents, yielding text that will be more agreeable to NLP. 1https:</context>
</contexts>
<marker>Barbosa, Feng, 2010</marker>
<rawString>Luciano Barbosa and Junlan Feng. 2010. Robust sentiment detection on twitter from biased and noisy data. In Proceedings of the 23rd International Conference on Computational Linguistics: Posters, pages 36–44.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Cynthia Chew</author>
<author>Gunther Eysenbach</author>
</authors>
<title>Pandemics in the age of twitter: content analysis of tweets during the 2009 h1n1 outbreak.</title>
<date>2010</date>
<journal>PloS one,</journal>
<volume>5</volume>
<issue>11</issue>
<contexts>
<context position="941" citStr="Chew and Eysenbach (2010)" startWordPosition="139" endWordPosition="142">t normalization can be a crucial element for exploiting that information. This paper outlines our approach to text normalization used in the WNUT shared task. We show that a very simple solution, powered by a modestly sized, partiallycurated wordlist—combined with a modest reranking scheme—can deliver respectable results. 1 Introduction Twitter is an immense, living collection of written language from all over the world. Every day, Twitter publishes a staggering 500 million tweets1. The content of Twitter is virtually unlimited, and has proven useful for much research, including epidemiology: Chew and Eysenbach (2010); and sentiment analysis: Barbosa and Feng (2010), Bakliwal et al. (2013), Rosenthal et al. (2015), Li et al. (2014). It would take many readers to keep up with Twitter’s output, but, fortunately, we have natural language processing (NLP) methods that can automatically filter, condense, or extract information from text. However, NLP approaches are typically trained on formal edited text, and struggle with the informal, unedited text of Twitter. But there is a wellknown way to mitigate this problem: text normalization, i.e. replacing non-standard tokens with their standard equivalents, yielding</context>
</contexts>
<marker>Chew, Eysenbach, 2010</marker>
<rawString>Cynthia Chew and Gunther Eysenbach. 2010. Pandemics in the age of twitter: content analysis of tweets during the 2009 h1n1 outbreak. PloS one, 5(11):e14118.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Monojit Choudhury</author>
<author>Rahul Saraf</author>
<author>Vijit Jain</author>
<author>Animesh Mukherjee</author>
<author>Sudeshna Sarkar</author>
<author>Anupam Basu</author>
</authors>
<title>Investigation and modeling of the structure of texting language.</title>
<date>2007</date>
<journal>Int. J. Doc. Anal. Recognit.,</journal>
<volume>10</volume>
<issue>3</issue>
<contexts>
<context position="2170" citStr="Choudhury et al. (2007)" startWordPosition="326" endWordPosition="329">that will be more agreeable to NLP. 1https://blog.twitter.com/2013/new-tweets-per-secondrecord-and-how One flavor of non-standard writing—what I have previously focused on—is what I call “vernacular orthography” (VO). VO is spelling that indicates itentional non-standard pronunciation, such as when the string “dat” stands in for “that”. While numerous papers offer solutions for text normalization (e.g. Han and Baldwin (2011), Yang and Eisenstein (2013), Zhang et al. (2013), Sproat et al. (2001), Li and Liu (2014)), and a few build models based on phonemic similarity (e.g. Kobus et al. (2008), Choudhury et al. (2007)), none to our knowledge have addressed VO in particular. This paper, too, addresses the general normalization problem, but uses lessons learned attempting to normalize VO. 2 System Architecture The architecture of this system is very simple, consisting of three main parts: (1) a substitution list, (2) a couple of rule based components, and (3) a sentence level re-ranker. This provides for a fast pertoken performance. 2.1 Substitution List Most of the work is done by a semi-supervised substitution list consisting of ordered pairs. The first member of each pair is a string representing a nonsta</context>
</contexts>
<marker>Choudhury, Saraf, Jain, Mukherjee, Sarkar, Basu, 2007</marker>
<rawString>Monojit Choudhury, Rahul Saraf, Vijit Jain, Animesh Mukherjee, Sudeshna Sarkar, and Anupam Basu. 2007. Investigation and modeling of the structure of texting language. Int. J. Doc. Anal. Recognit., 10(3):157–174, December.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalisation of short text messages: Makn sens a #twitter.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11,</booktitle>
<pages>368--378</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="1975" citStr="Han and Baldwin (2011)" startWordPosition="292" endWordPosition="295"> informal, unedited text of Twitter. But there is a wellknown way to mitigate this problem: text normalization, i.e. replacing non-standard tokens with their standard equivalents, yielding text that will be more agreeable to NLP. 1https://blog.twitter.com/2013/new-tweets-per-secondrecord-and-how One flavor of non-standard writing—what I have previously focused on—is what I call “vernacular orthography” (VO). VO is spelling that indicates itentional non-standard pronunciation, such as when the string “dat” stands in for “that”. While numerous papers offer solutions for text normalization (e.g. Han and Baldwin (2011), Yang and Eisenstein (2013), Zhang et al. (2013), Sproat et al. (2001), Li and Liu (2014)), and a few build models based on phonemic similarity (e.g. Kobus et al. (2008), Choudhury et al. (2007)), none to our knowledge have addressed VO in particular. This paper, too, addresses the general normalization problem, but uses lessons learned attempting to normalize VO. 2 System Architecture The architecture of this system is very simple, consisting of three main parts: (1) a substitution list, (2) a couple of rule based components, and (3) a sentence level re-ranker. This provides for a fast perto</context>
</contexts>
<marker>Han, Baldwin, 2011</marker>
<rawString>Bo Han and Timothy Baldwin. 2011. Lexical normalisation of short text messages: Makn sens a #twitter. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11, pages 368– 378, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Catherine Kobus</author>
<author>François Yvon</author>
<author>Géraldine Damnati</author>
</authors>
<title>Normalizing sms: Are two metaphors better than one?</title>
<date>2008</date>
<booktitle>In Proceedings of the 22nd International Conference on Computational Linguistics - Volume 1, COLING ’08,</booktitle>
<pages>441--448</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="2145" citStr="Kobus et al. (2008)" startWordPosition="322" endWordPosition="325">lents, yielding text that will be more agreeable to NLP. 1https://blog.twitter.com/2013/new-tweets-per-secondrecord-and-how One flavor of non-standard writing—what I have previously focused on—is what I call “vernacular orthography” (VO). VO is spelling that indicates itentional non-standard pronunciation, such as when the string “dat” stands in for “that”. While numerous papers offer solutions for text normalization (e.g. Han and Baldwin (2011), Yang and Eisenstein (2013), Zhang et al. (2013), Sproat et al. (2001), Li and Liu (2014)), and a few build models based on phonemic similarity (e.g. Kobus et al. (2008), Choudhury et al. (2007)), none to our knowledge have addressed VO in particular. This paper, too, addresses the general normalization problem, but uses lessons learned attempting to normalize VO. 2 System Architecture The architecture of this system is very simple, consisting of three main parts: (1) a substitution list, (2) a couple of rule based components, and (3) a sentence level re-ranker. This provides for a fast pertoken performance. 2.1 Substitution List Most of the work is done by a semi-supervised substitution list consisting of ordered pairs. The first member of each pair is a str</context>
</contexts>
<marker>Kobus, Yvon, Damnati, 2008</marker>
<rawString>Catherine Kobus, François Yvon, and Géraldine Damnati. 2008. Normalizing sms: Are two metaphors better than one? In Proceedings of the 22nd International Conference on Computational Linguistics - Volume 1, COLING ’08, pages 441–448, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Chen Li</author>
<author>Yang Liu</author>
</authors>
<title>Improving text normalization via unsupervised model and discriminative reranking.</title>
<date>2014</date>
<booktitle>In Proceedings of the ACL 2014 Student Research Workshop,</booktitle>
<pages>86--93</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Baltimore, Maryland, USA,</location>
<contexts>
<context position="2065" citStr="Li and Liu (2014)" startWordPosition="308" endWordPosition="311">t normalization, i.e. replacing non-standard tokens with their standard equivalents, yielding text that will be more agreeable to NLP. 1https://blog.twitter.com/2013/new-tweets-per-secondrecord-and-how One flavor of non-standard writing—what I have previously focused on—is what I call “vernacular orthography” (VO). VO is spelling that indicates itentional non-standard pronunciation, such as when the string “dat” stands in for “that”. While numerous papers offer solutions for text normalization (e.g. Han and Baldwin (2011), Yang and Eisenstein (2013), Zhang et al. (2013), Sproat et al. (2001), Li and Liu (2014)), and a few build models based on phonemic similarity (e.g. Kobus et al. (2008), Choudhury et al. (2007)), none to our knowledge have addressed VO in particular. This paper, too, addresses the general normalization problem, but uses lessons learned attempting to normalize VO. 2 System Architecture The architecture of this system is very simple, consisting of three main parts: (1) a substitution list, (2) a couple of rule based components, and (3) a sentence level re-ranker. This provides for a fast pertoken performance. 2.1 Substitution List Most of the work is done by a semi-supervised subst</context>
</contexts>
<marker>Li, Liu, 2014</marker>
<rawString>Chen Li and Yang Liu. 2014. Improving text normalization via unsupervised model and discriminative reranking. In Proceedings of the ACL 2014 Student Research Workshop, pages 86–93, Baltimore, Maryland, USA, June. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jiwei Li</author>
<author>Alan Ritter</author>
<author>Eduard H Hovy</author>
</authors>
<title>Weakly supervised user profile extraction from twitter.</title>
<date>2014</date>
<booktitle>In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics, ACL</booktitle>
<volume>Volume</volume>
<pages>165--174</pages>
<location>Baltimore, MD, USA,</location>
<contexts>
<context position="1057" citStr="Li et al. (2014)" startWordPosition="158" endWordPosition="161">ion used in the WNUT shared task. We show that a very simple solution, powered by a modestly sized, partiallycurated wordlist—combined with a modest reranking scheme—can deliver respectable results. 1 Introduction Twitter is an immense, living collection of written language from all over the world. Every day, Twitter publishes a staggering 500 million tweets1. The content of Twitter is virtually unlimited, and has proven useful for much research, including epidemiology: Chew and Eysenbach (2010); and sentiment analysis: Barbosa and Feng (2010), Bakliwal et al. (2013), Rosenthal et al. (2015), Li et al. (2014). It would take many readers to keep up with Twitter’s output, but, fortunately, we have natural language processing (NLP) methods that can automatically filter, condense, or extract information from text. However, NLP approaches are typically trained on formal edited text, and struggle with the informal, unedited text of Twitter. But there is a wellknown way to mitigate this problem: text normalization, i.e. replacing non-standard tokens with their standard equivalents, yielding text that will be more agreeable to NLP. 1https://blog.twitter.com/2013/new-tweets-per-secondrecord-and-how One fla</context>
</contexts>
<marker>Li, Ritter, Hovy, 2014</marker>
<rawString>Jiwei Li, Alan Ritter, and Eduard H. Hovy. 2014. Weakly supervised user profile extraction from twitter. In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics, ACL 2014, June 22-27, 2014, Baltimore, MD, USA, Volume 1: Long Papers, pages 165–174.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Sara Rosenthal</author>
<author>Preslav Nakov</author>
<author>Svetlana Kiritchenko</author>
<author>Saif M Mohammad</author>
<author>Alan Ritter</author>
<author>Veselin Stoyanov</author>
</authors>
<title>Semeval-2015 task 10: Sentiment analysis in twitter.</title>
<date>2015</date>
<booktitle>In Proceedings of the 9th International Workshop on Semantic Evaluation, SemEval.</booktitle>
<contexts>
<context position="1039" citStr="Rosenthal et al. (2015)" startWordPosition="154" endWordPosition="157">proach to text normalization used in the WNUT shared task. We show that a very simple solution, powered by a modestly sized, partiallycurated wordlist—combined with a modest reranking scheme—can deliver respectable results. 1 Introduction Twitter is an immense, living collection of written language from all over the world. Every day, Twitter publishes a staggering 500 million tweets1. The content of Twitter is virtually unlimited, and has proven useful for much research, including epidemiology: Chew and Eysenbach (2010); and sentiment analysis: Barbosa and Feng (2010), Bakliwal et al. (2013), Rosenthal et al. (2015), Li et al. (2014). It would take many readers to keep up with Twitter’s output, but, fortunately, we have natural language processing (NLP) methods that can automatically filter, condense, or extract information from text. However, NLP approaches are typically trained on formal edited text, and struggle with the informal, unedited text of Twitter. But there is a wellknown way to mitigate this problem: text normalization, i.e. replacing non-standard tokens with their standard equivalents, yielding text that will be more agreeable to NLP. 1https://blog.twitter.com/2013/new-tweets-per-secondreco</context>
</contexts>
<marker>Rosenthal, Nakov, Kiritchenko, Mohammad, Ritter, Stoyanov, 2015</marker>
<rawString>Sara Rosenthal, Preslav Nakov, Svetlana Kiritchenko, Saif M Mohammad, Alan Ritter, and Veselin Stoyanov. 2015. Semeval-2015 task 10: Sentiment analysis in twitter. In Proceedings of the 9th International Workshop on Semantic Evaluation, SemEval.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Richard Sproat</author>
<author>Alan W Black</author>
<author>Stanley Chen</author>
<author>Shankar Kumar</author>
<author>Mari Ostendorf</author>
<author>Christopher Richards</author>
</authors>
<title>Normalization of non-standard words.</title>
<date>2001</date>
<journal>Computer Speech &amp; Language,</journal>
<volume>15</volume>
<issue>3</issue>
<contexts>
<context position="2046" citStr="Sproat et al. (2001)" startWordPosition="304" endWordPosition="307">gate this problem: text normalization, i.e. replacing non-standard tokens with their standard equivalents, yielding text that will be more agreeable to NLP. 1https://blog.twitter.com/2013/new-tweets-per-secondrecord-and-how One flavor of non-standard writing—what I have previously focused on—is what I call “vernacular orthography” (VO). VO is spelling that indicates itentional non-standard pronunciation, such as when the string “dat” stands in for “that”. While numerous papers offer solutions for text normalization (e.g. Han and Baldwin (2011), Yang and Eisenstein (2013), Zhang et al. (2013), Sproat et al. (2001), Li and Liu (2014)), and a few build models based on phonemic similarity (e.g. Kobus et al. (2008), Choudhury et al. (2007)), none to our knowledge have addressed VO in particular. This paper, too, addresses the general normalization problem, but uses lessons learned attempting to normalize VO. 2 System Architecture The architecture of this system is very simple, consisting of three main parts: (1) a substitution list, (2) a couple of rule based components, and (3) a sentence level re-ranker. This provides for a fast pertoken performance. 2.1 Substitution List Most of the work is done by a se</context>
</contexts>
<marker>Sproat, Black, Chen, Kumar, Ostendorf, Richards, 2001</marker>
<rawString>Richard Sproat, Alan W Black, Stanley Chen, Shankar Kumar, Mari Ostendorf, and Christopher Richards. 2001. Normalization of non-standard words. Computer Speech &amp; Language, 15(3):287–333.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Yi Yang</author>
<author>Jacob Eisenstein</author>
</authors>
<title>A log-linear model for unsupervised text normalization.</title>
<date>2013</date>
<booktitle>In EMNLP,</booktitle>
<pages>61--72</pages>
<contexts>
<context position="2003" citStr="Yang and Eisenstein (2013)" startWordPosition="296" endWordPosition="299"> of Twitter. But there is a wellknown way to mitigate this problem: text normalization, i.e. replacing non-standard tokens with their standard equivalents, yielding text that will be more agreeable to NLP. 1https://blog.twitter.com/2013/new-tweets-per-secondrecord-and-how One flavor of non-standard writing—what I have previously focused on—is what I call “vernacular orthography” (VO). VO is spelling that indicates itentional non-standard pronunciation, such as when the string “dat” stands in for “that”. While numerous papers offer solutions for text normalization (e.g. Han and Baldwin (2011), Yang and Eisenstein (2013), Zhang et al. (2013), Sproat et al. (2001), Li and Liu (2014)), and a few build models based on phonemic similarity (e.g. Kobus et al. (2008), Choudhury et al. (2007)), none to our knowledge have addressed VO in particular. This paper, too, addresses the general normalization problem, but uses lessons learned attempting to normalize VO. 2 System Architecture The architecture of this system is very simple, consisting of three main parts: (1) a substitution list, (2) a couple of rule based components, and (3) a sentence level re-ranker. This provides for a fast pertoken performance. 2.1 Substit</context>
</contexts>
<marker>Yang, Eisenstein, 2013</marker>
<rawString>Yi Yang and Jacob Eisenstein. 2013. A log-linear model for unsupervised text normalization. In EMNLP, pages 61–72.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Congle Zhang</author>
<author>Tyler Baldwin</author>
<author>Howard Ho</author>
<author>Benny Kimelfeld</author>
<author>Yunyao Li</author>
</authors>
<title>Adaptive parser-centric text normalization.</title>
<date>2013</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics</booktitle>
<volume>1</volume>
<pages>1159--1168</pages>
<contexts>
<context position="2024" citStr="Zhang et al. (2013)" startWordPosition="300" endWordPosition="303">wellknown way to mitigate this problem: text normalization, i.e. replacing non-standard tokens with their standard equivalents, yielding text that will be more agreeable to NLP. 1https://blog.twitter.com/2013/new-tweets-per-secondrecord-and-how One flavor of non-standard writing—what I have previously focused on—is what I call “vernacular orthography” (VO). VO is spelling that indicates itentional non-standard pronunciation, such as when the string “dat” stands in for “that”. While numerous papers offer solutions for text normalization (e.g. Han and Baldwin (2011), Yang and Eisenstein (2013), Zhang et al. (2013), Sproat et al. (2001), Li and Liu (2014)), and a few build models based on phonemic similarity (e.g. Kobus et al. (2008), Choudhury et al. (2007)), none to our knowledge have addressed VO in particular. This paper, too, addresses the general normalization problem, but uses lessons learned attempting to normalize VO. 2 System Architecture The architecture of this system is very simple, consisting of three main parts: (1) a substitution list, (2) a couple of rule based components, and (3) a sentence level re-ranker. This provides for a fast pertoken performance. 2.1 Substitution List Most of th</context>
</contexts>
<marker>Zhang, Baldwin, Ho, Kimelfeld, Li, 2013</marker>
<rawString>Congle Zhang, Tyler Baldwin, Howard Ho, Benny Kimelfeld, and Yunyao Li. 2013. Adaptive parser-centric text normalization. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics (1), pages 1159–1168.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>