<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.053169">
<title confidence="0.997117">
NCSU_SAS_SAM: Deep Encoding and Reconstruction
for Normalization of Noisy Text
</title>
<author confidence="0.995575">
Samuel P. Leeman-Munk James C. Lester
</author>
<affiliation confidence="0.810424">
Center for Educational Informatics
North Carolina State University
</affiliation>
<address confidence="0.880106">
Raleigh, NC, USA
</address>
<email confidence="0.999389">
{spleeman, lester}@ncsu.edu
</email>
<author confidence="0.859214">
James A. Cox
</author>
<affiliation confidence="0.8014065">
Text Analytics R&amp;D
SAS Institute Inc.
</affiliation>
<address confidence="0.873779">
Cary, NC, USA
</address>
<email confidence="0.998044">
james.cox@sas.com
</email>
<sectionHeader confidence="0.99563" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999933">
As a participant in the W-NUT Lexical
Normalization for English Tweets chal-
lenge, we use deep learning to address
the constrained task. Specifically, we use
a combination of two augmented feed
forward neural networks, a flagger that
identifies words to be normalized and a
normalizer, to take in a single token at a
time and output a corrected version of
that token. Despite avoiding off-the-shelf
tools trained on external data and being
an entirely context-free model, our sys-
tem still achieved an F1-score of 81.49%,
comfortably surpassing the next runner
up by 1.5% and trailing the second place
model by only 0.26%.
</bodyText>
<sectionHeader confidence="0.999133" genericHeader="keywords">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999866333333333">
The phenomenal growth of social media, web
forums, and online reviews has spurred a grow-
ing interest in automated analysis of user-
generated text. User-generated text presents sig-
nificant computational challenges because it is
often highly disfluent. To address these chal-
lenges, we have begun to see a growing demand
for tools and techniques to transform noisy user-
generated text into a canonical form, most re-
cently in the Workshop on Noisy User Text at
the Association for Computational Linguistics.
This work describes a submission to the Lexical
Normalization for English Tweets challenge as
part of this workshop (Baldwin et al., 2015).
Motivated by the success of prior deep neural
network architectures, particularly denoising au-
toencoders, we have developed an approach to
transform noisy user-generated text into a canon-
ical form with a feed-forward neural network
augmented with a projection layer (Collobert et
al., 2011; Kalchbrenner, Grefenstette, &amp;
Blunsom, 2014; Vincent, Larochelle, Bengio, &amp;
Manzagol, 2008). The model performs a charac-
ter-level analysis on each word of the input. The
absence of hand-engineered features and the
avoidance of direct and indirect external data
make this model unique among the three top-
performing models in the constrained task.
This paper is organized as follows. In Sec-
tion 2 we describe each component of our model.
In Section 3 we describe the specific instantia-
tion of our model, and in Section 4 we present
and discuss results.
</bodyText>
<sectionHeader confidence="0.927211" genericHeader="introduction">
2 Architecture and Components
</sectionHeader>
<bodyText confidence="0.999945357142857">
Our model consists of three components: a Nor-
malizer that encodes the input and then recon-
structs it in normalized form, a Flagger that de-
termines whether the Normalizer should be used
or if the word should be taken as-is, and a Con-
former that attempts to smooth out simple errors
introduced by quirks in the Normalizer.
In this section we will use the simple example
transformation of “u” to “you” where “u” is the
input text and “you” is the gold standard normal-
ization. In our example we use a maximum word
size of three. Figure 1 shows the flow of our ex-
ample through the model. In broad overview, the
input is preprocessed and sent to both the Nor-
</bodyText>
<page confidence="0.996339">
154
</page>
<note confidence="0.9884435">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 154–161,
Beijing, China, July 31, 2015. © 2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.999724">
malizer and the Flagger. The Normalizer com-
putes a candidate normalization, and the Flagger
determines whether to use that candidate or the
original word. The Normalizer’s output is passed
to the Conformer, which conforms it to a word in
the vocabulary list, and then the candidate, the
flag, and the original input word are passed to a
simple decision component that either keeps the
original word or uses the normalized version
based on the output of the Flagger. While it may
seem inefficient that the normalized version is
always computed, even if it is not used, this ap-
proach is used so that the Normalizer and Flag-
ger can be run in parallel on many inputs at once.
</bodyText>
<subsectionHeader confidence="0.976539">
2.1 Deep Feed-Forward Neural Networks
</subsectionHeader>
<bodyText confidence="0.999937136363636">
As the central element of the Flagger and the
Normalizer, the deep feed-forward neural net-
work forms the basis of our model. A deep feed-
forward neural network takes a vector of num-
bers as input. This vector is known as a layer and
each value within it is a neuron. The network
A deep feed-forward neural network can contain
any number of hidden layers, each going through
the same process, multiplying by a matrix of
weights and transforming via a non-linearity.
Hidden layers may also be of any size. Multiple
applications of learnable weight matrices and
non-linear transformations together allow a deep
neural network to represent complex relation-
ships between input and output (Bengio, 2009).
Deep feed-forward neural networks are trained
by backpropagation. Backpropagation is a train-
ing method by which the gradient of any given
weight in a network can be calculated from the
error between the output of the network and a
gold standard. It is described in more detail in
(Rumelhart, Hinton, &amp; Williams, 1986).
</bodyText>
<subsectionHeader confidence="0.999171">
2.2 The Normalizer
</subsectionHeader>
<bodyText confidence="0.99636425">
Our use of deep feed-forward neural networks
for the task of normalization is inspired by the
success of denoising autoencoders. (Vincent et
al., 2008). Denoising autoencoders are neural
</bodyText>
<figureCaption confidence="0.9895875">
Figure 1: A flowchart detailing the process of normalizing a word. Information flows from left to right and ellipses represent
data objects while rectangles represent processes.
</figureCaption>
<bodyText confidence="0.995925">
multiplies the input layer by a matrix of weights
to return another vector. This new vector is then
transformed by a non-linearity. A number of
functions can serve as the non-linearity, includ-
ing the sigmoid and the hyperbolic tangent, but
our model uses a rectified linear unit, given by
the following expression.
</bodyText>
<equation confidence="0.783824">
y = \max(x, 0)
</equation>
<bodyText confidence="0.999974833333333">
The rectified linear unit has been successful in a
number of natural language tasks such as speech
processing (Zeiler et al., 2013), and it was effec-
tive in an unpublished part-of-speech tagging
model we developed.
The transformed vector is referred to as a hid-
den layer because its values are never directly
observed in the normal functioning of the model.
networks whose output is the same as their input.
That is, they specialize in developing a robust
encoding of an input such that the input can be
reconstructed from the encoding alone. The de-
noising aspect refers to the fact that to encourage
robustness, denoising autoencoders are given
inputs that have been deliberately corrupted, or
“noised” and are expected to reconstruct them
without the noise. It is this “denoising” aspect
that makes denoising autoencoders so interesting
for text normalization.
The main component of our model, the Nor-
malizer, uses a feed-forward neural network that
functions on a similar principle to that of a de-
noising autoencoder. It reads the character se-
quence that describes the word and encodes it
</bodyText>
<page confidence="0.998505">
155
</page>
<bodyText confidence="0.999856233333333">
internally, outputting the denoised (normalized)
version. It accomplishes this in three sets of lay-
ers. First the character projection layer takes a
string and represents it as a fixed-length numeric
vector. Next, a feed-forward neural network con-
verts the data into its internal representation and,
with a special output layer, into a denoised ver-
sion of the input. Figure 2 shows a diagram of
the Normalizer’s architecture.
The first step of the Normalizer is performed
by the character projection layer (Collobert et al.,
2011). The character projection layer learns
floating point vector representations of charac-
ters, which it concatenates into one large floating
point vector word representation. In our example,
the letter “u” is represented by n floating point
numbers. For example, if n = 3 the representation
for “u” might be [0.1, -1.2, -0.3]. This vector was
chosen arbitrarily, but in the actual model, values
are learned in training. The representations allow
more information to be associated with a charac-
ter than a simple numeric index.
In this simple example, the word “u” is com-
posed of one character, but if it were longer, each
letter would be separately represented. A key
challenge at this point is that a feed-forward neu-
ral network cannot handle an arbitrary number of
inputs. Because each position in the vector is a
neuron matched directly to a set of weights,
changing the size of the vector would require
changing the size of the learned weights, and the
model would have to be retrained.
To accommodate this, we use a fixed window.
Before we send our input to the Normalizer, we
comes [u, _, _] and then is projected and concat-
enated and becomes something like [0.1, -1.2, -
0.3, 1.3, 0.0, -1.1, 1.3, 0.0, -1.1]. Notice that we
have nine values now in our input. That is the
three values from “u” and then the three values
for “_” ([1.3, 0.0, -1.1]) twice, once for each “_”.
After this step, the system has a numeric vector
representation of a word that is always the same
length. It now sends it to the first layer of the
feed-forward neural network. We deliberately
select a large enough window that only in a small
minority of cases does a word have to be reduced
to fit into the window.
The last hidden layer’s values go through one
final matrix multiplication to output a list of val-
ues wv in size, where w is the size of the window
and v is the number of possible characters includ-
ing the padding character, that is, the number of
characters in the alphabet, which is shared be-
tween the input and output layers. In this last
layer the nonlinear transformation is a special
version of the softmax operation.
The softmax operation transforms a vector
such that each of its values is between zero and
one and the new vector sums to one. Mathemati-
cally, it is given as:
</bodyText>
<equation confidence="0.95271">
\sigma(z)_j = \frac{e^{z_j}}{\sum_{k=1}^{K} e^{z_k}}
</equation>
<bodyText confidence="0.938529333333333">
Where K is the number of values in the vector. In
our model, K = v, the size of the alphabet. These
individual values can alternately be considered
posterior probabilities for each of the possible
decisions. If each value is mapped to a character,
</bodyText>
<figureCaption confidence="0.998143">
Figure 2: A diagram of the Normalizer correcting “u” to “you.” The circles represent values, the lines weights.
</figureCaption>
<bodyText confidence="0.999627333333333">
preprocess it to meet a specified length, filling in
unused spaces with a sentinel padding “charac-
ter” that projects to its own set of learned
weights like the other characters. Since the max-
imum word size in our example is 3, we use a
window of size 3. Therefore, our input “u” be-
one can simply take the highest value to select
the most likely character. In this case, we are
predicting a window of w characters rather than a
single character, so we perform softmax sepa-
rately on each of the w sets of v values in the lay-
er. In prediction, we simply take the index of the
</bodyText>
<page confidence="0.993979">
156
</page>
<bodyText confidence="0.999788">
highest value in each of the w sets, but in training
we take the whole prediction distribution and try
to maximize the likelihood of each correct letter.
We do not attempt to predict character embed-
dings because we are learning them, and the
model would be likely to learn a trivial function
with character embeddings that are all equal.
Training the Normalizer as a whole relies on
generating posterior distributions and attempting
to minimize the total negative log likelihood of
the gold standard. Mathematically, our objective
function is
</bodyText>
<equation confidence="0.983231">
\mathrm{cost} = -\sum_{p \in P} \ln p
</equation>
<bodyText confidence="0.999922636363636">
Where p is an element in P, the vector of the
probabilities of each gold standard letter. So, if
our model predicts “y” as 75% likely for charac-
ter 1, “o” as 95% likely for character 2, and “u”
as 89% likely for character 3 in our window of
size 3, the negative log likelihoods calculated as
(.29, .05, .12) are summed to get the error. This
sum error gives a simple measurement of per-
formance to optimize, which backpropagates
through the model to learn all the weights de-
scribed above (Rumelhart et al., 1986).
</bodyText>
<subsectionHeader confidence="0.999075">
2.3 The Flagger
</subsectionHeader>
<bodyText confidence="0.999962217391304">
The Flagger identifies what does and does not
require normalization. The vast majority of the
training data (91%) does not require normaliza-
tion, so returning the reconstructed encoding of
every word would risk incorrectly regenerating
an already canonical token.
The Flagger has the same general structure as
the Normalizer itself except for the final layer.
Instead of generating text at the last layer, a
softmax layer predicts whether the token should
be normalized at all. Thus, the Flagger’s output
layer is two neurons in size, one representing the
flag “Do Normalize,” and another representing
the flag “Do Not Normalize.” In the construction
of the gold standard for the task, there were three
reasons a token would not be normalized: first,
the token is already correct, second, the token is
in a protected category (hashtags or foreign
words), or third, it was simply unrecognizable
such that the human normalizer could not find
the correct form. The Flagger accounts for but
does not distinguish between these three possibil-
ities.
</bodyText>
<subsectionHeader confidence="0.993489">
2.4 The Conformer
</subsectionHeader>
<bodyText confidence="0.999979193548387">
Even when a token should be corrected, it is pos-
sible that the normalizer will come very close to
correcting it without succeeding. Reconstructing
the word “laughing,” for instance, the normalizer
can fail completely if it predicts even one letter
wrong. An early analysis of validation data found
that the normalizer had predicted “laugling” in-
stead of laughing. These off-by-one errors are a
frequent enough occurrence to merit a module to
deal with them. The Conformer is also useful for
correctly normalizing rare words whose correct
normalization is too long for the window to rep-
resent. In particular “lmfao” expands to an im-
pressive 27 characters, but if the Normalizer pre-
dicts only the first 25 characters, the Conformer
can easily select the correct token.
To correct these small normalizer errors we
construct the Conformer by collecting a diction-
ary from the gold standard training data. The dic-
tionary is simply a list of all the unique words in
the gold standard data. Then at runtime, whenev-
er the Normalizer runs and predicts a word that is
not present in the dictionary, we replace it with
the closest word in the dictionary according to
Levenshtein distance (Levenshtein, 1966). Ties
are resolved based on which word comes first in
the dictionary. Because Python’s set function,
which does not guarantee a specific order of its
contents, is used to construct the dictionary, the
dictionary’s order is not predictable and thus ties
are resolved unpredictably.
</bodyText>
<sectionHeader confidence="0.98931" genericHeader="method">
3 Settings and Evaluation
</sectionHeader>
<bodyText confidence="0.999989652173913">
The model was implemented in Theano, a Py-
thon library for fast evaluation of multi-
dimensional arrays using matrix operations
(Bastien et al., 2012; Bergstra et al., 2010). We
used Theano’s implementation of backpropaga-
tion to train our model. For our window size, we
selected 25 characters, which is large enough to
completely represent 99.9% of the tokens in the
training data while remaining computationally
feasible. There are also a number of hyper-
parameters: the number and size of hidden lay-
ers, the size of character embeddings, and the
dropout rate. We tried various combinations of
values between 50 and 6000 for the size and 1
and 4 for the number of hidden layers in both our
Normalizer and Flagger. Some combinations we
tried can be seen in the results section. Especially
large sizes and numbers of layers proved to re-
quire more memory than our GPU could support,
and training them on our CPU was exceptionally
slow. We also tried 50% and 75% dropout,
meaning that during training we randomly ex-
cluded hidden nodes from consideration at each
</bodyText>
<page confidence="0.990565">
157
</page>
<bodyText confidence="0.999923191780822">
layer. Dropout has been shown to improve per-
formance by discouraging overfitting on the
training data, and 50% and 75% are common
dropout rates (Hinton, 2014).
We found the highest F1 score on the valida-
tion data for the Normalizer with two hidden lay-
ers of size 2000 each and 50% dropout. This was
close to the maximum size our GPU could sup-
port without reducing the batch size to be too
small to take advantage of the parallelism. The
Flagger’s highest score was found at two hidden
layers of size 1000 each and 75% dropout. At-
tempts to provide hidden layers of different sizes
consistently found inferior results. For the size of
each embedding in the character projection layer,
10 had proven effective earlier in a simpler un-
published Twitter part-of-speech task. We select-
ed 25 for our character embedding size to ac-
count for the greater complexity of a normaliza-
tion task.
We separated the provided training data into
90% training data, 5% validation data and 5%
was held out as test data. In order to construct a
useful model on the small amount of available
data, we iterate training over the same data many
times. Our model stopped training after 150
training iterations in which there was no im-
provement on the validation set. We chose 150
iterations as the smallest value that did not lead
to ending the training at a clearly suboptimal
value. The training also stops at 5,000 iterations
but in practice it converged before reaching this
value.
Early in development we found that the Nor-
malizer had exceptional trouble reconstructing
twitter-specific objects, that is, hash-tags
(#goodday), at-mentions (@marysue) and URLs
(http://blahblah.com). Generally its behavior in
all three cases was to follow the standard marker
characters (@, #, http://) with a string of gibber-
ish unrelated to the word itself. Because these are
protected categories that should not be changed,
we removed them from the training data and rely
on the Flagger to flag them as not to be correct-
ed.
We used layer-wise pre-training, meaning we
first trained with zero hidden layers (going di-
rectly from the character projection to the soft-
max layer) to initialize the character embeddings,
then we trained with one hidden layer, initializ-
ing the character embeddings with their previ-
ously trained values. When we trained the full
model using two hidden layers, we initialized
both the character projection layer and the
weights from the projected input to the first hid-
den layer with the values learned before. The
model continued to learn all the weights it used.
Pretrained weights continued to be trained in the
full model, although “freezing” some pretrained
weights after pretraining and only training later
weights in the full model has shown success
when working with large amounts of unsuper-
vised data and may be worthwhile to consider in
future work (Yosinski, Clune, Bengio, &amp; Lipson,
2014).
Running on an NVIDIA GeForce GTX 680
GPU with 2 GB of onboard memory, training the
Normalizer took about six hours. We do not in-
clude CPU and RAM specifications because they
were not heavily utilized in the GPU implemen-
tation. The Flagger was considerably faster to
train than the Normalizer, taking only a little
over half an hour.
</bodyText>
<sectionHeader confidence="0.999696" genericHeader="evaluation">
4 Results and Discussion
</sectionHeader>
<bodyText confidence="0.999991323529412">
The model earned third place in the competition,
with scores very close to the second place model.
The model’s results in the competition compared
to the first, second, and fourth place models are
shown in Table 1. The precision scores are much
higher than the recall scores for all models be-
cause in this task precision measures the capabil-
ity of the model to not normalize what does not
need normalizing while recall requires that a
model both correctly identify what needs to be
normalized and correctly normalize it.
In addition to the challenge results, we per-
formed a more in-depth analysis on our own
held-out validation and test data. Our analysis of
the scores is shown in Table 2.
Initial data on the Flagger is in Table 3. We
further analyzed the different errors made on the
validation data. Our findings can be found in Ta-
ble 4. Given the large proportion of errors mis-
takenly marked “Do Not Normalize,” we looked
at these errors. A few examples can be found in
Table 5. Although the Flagger was not trained
with Normalizer confidence in mind, it does an
impressive job of only cancelling a normalization
when the normalization is either unnecessary or
would fail. In no case did the Flagger prevent the
Normalizer from making a correct normalization.
An analysis in Figure 3 shows some early re-
sults from using only the Normalizer without a
Conformer or Flagger. To fit this many runs in a
reasonable time span, we used only ten percent
of the training data. In this analysis, error rate is
measured by token. To put the error rates in per-
spective, our final error rate was close to three
</bodyText>
<page confidence="0.996432">
158
</page>
<bodyText confidence="0.999962285714286">
percent. We show this graph to illustrate a num-
ber of points. Particularly, we wish to illustrate
the challenge of encoding and reconstructing
every item in a massive vocabulary, the value of
additional iterations of layer-wise pre-training,
and the large spikes in the error rates at certain
points in the model.
</bodyText>
<table confidence="0.999877">
Model Precision Recall F1-Score
NCSU_SAS 0.9061 0.7865 0.8421
NING
NCSU_SAS 0.9136 0.7398 0.8175
WOOKHEE
NCSU SAS 0.9012 0.7437 0.8149
SAM_
Iitp 0.9026 0.7191 0.8005
</table>
<tableCaption confidence="0.99889">
Table 1: Results of the constrained task
</tableCaption>
<table confidence="0.9999238">
Data Precision Recall F1-
Score Accuracy
Valida- 0.8942 0.7752 0.8305 0.9740
tion
Test 0.8229 0.6870 0.7488 0.9656
</table>
<tableCaption confidence="0.815099">
Table 2: Model Scores on Validation and Test Data
</tableCaption>
<table confidence="0.9999438">
Data Precision Recall F1-
Score Accuracy
Valida- 0.9818 0.9939 0.9878 0.9776
tion
Test 0.9783 0.9930 0.9856 0.9736
</table>
<tableCaption confidence="0.981619">
Table 3: Flagger scores on Validation and Test Data
</tableCaption>
<table confidence="0.999856375">
Error Percentage
Occurrence
Correctly flagged, 13.85%
misnormalized
Mistakenly flagged 66.15%
“Do Not Normalize”
Mistakenly flagged 20.00%
“Do Normalize”
</table>
<tableCaption confidence="0.9455995">
Table 4: Analysis of errors. Percentages given are out of the
total error count.
</tableCaption>
<table confidence="0.999837444444445">
Original Gold Stand- Normalized
ard
FB Facebook fabol
Fuhh f*** fuhh
OPENFOLLOW open follow openffolow
Feela Feels feela
Bkuz because bkuze
Kin kind of kin
Bruuh brother bruuhr
</table>
<tableCaption confidence="0.96563725">
Table 5: Examples of tokens that were mistakenly flagged
“Do Not Normalize.” The “Normalized” column is what the
model would have produced if the Flagger had produced the
flag “Do Normalize”
</tableCaption>
<bodyText confidence="0.999966543859649">
The Normalizer demands much more rep-
resentational power when not assisted by the
Flagger. Before we added the Flagger, we
saw continual improvement of results going
up to four layers of six thousand nodes each.
We saw greater improvements from adding
more nodes per layer than from adding more
layers. The cluster of three lines near the top
all have layers of 1500 or 2000 nodes each,
and the next cluster down is the models we
tried with 4500 and 6000 nodes. Incidentally,
all but the smallest of these models were too
large for our GPU’s 2GB of onboard
memory. As a reminder, after we added the
flagger, we only required two layers of 2000
nodes each to get competitive results. In each
case we used a dropout rate of 50%.
The default models pre-trained each layer for
250 iterations and we also trained models with
the same structure for 500 iterations. We find a
noticeable improvement in the error rate for the
models that were pre-trained for more iterations.
In the graph, the models with more pre-training
make up the cluster of lines near the bottom of
the graph.
Looking at the graphs, one may notice that
some lines have brief spikes multiple percentage
points in size. Because it only takes a one-letter
mistake for a word to be misnormalized, we ex-
pect that at these times a small error arose that
affected a large number of words. It is worth
pointing out that each model continues to im-
prove while in its spike, eventually dropping
back to pre-spike levels.
The model is unique among the three top-
performing models in that it avoids external data
both directly and through indirect sources. The
constrained task does not allow external data, but
it does allow the use of off-the-shelf tools trained
on external data. Our model does not use any
such tools. Without the assistance of tools such
as part-of-speech taggers, attempts to use context
proved ineffective, likely because of increased
sparsity. A given word that appears in the train-
ing set three hundred times may only appear
three times after another particular word, and
may not occur more than once with a particular
prior word and following word, so it is more dif-
ficult to find patterns in limited data. Future
work could either attempt to use tools to provide
additional information or could simply take ad-
vantage of large amounts of data to learn directly
the relationships such tools traditionally abstract
for the benefit of conventional machine learning.
There is one other point: the human graders
often made different decisions about whether or
what a term should be normalized to. For exam-
</bodyText>
<page confidence="0.996491">
159
</page>
<bodyText confidence="0.999887730769231">
ple, sometimes the word “pics” used to refer to
pictures was normalized to “pictures” but other
times it was left as “pics”. These inconsistencies
in the gold standard make it difficult to accurate-
ly judge the quality of the models submitted. Oc-
casionally when we examined mistakes the mod-
el made, we found that the model’s prediction
was correct according to the gold standard, but
that the gold standard was wrong. An inter-rater
reliability measure would help us to gauge not
only how well our models compare to each other
but how they compare to agreement between
human coders.
this happened much less often than having the
system normalize incorrectly. A model that pre-
dicts words from a vocabulary instead of recon-
structing them would be faster to train and would
not require a Conformer, and, considering the top
two models were vocabulary based, might out-
perform our reconstruction-based model.
A second direction for future work centers on
leveraging external data. With more time and
greater computing power, it may be the case that
it is possible to learn sophisticated language
models in an unsupervised fashion from both
standard conversational text and twitter data.
</bodyText>
<figureCaption confidence="0.96913975">
Figure 3: The Normalizer component validation scores by epoch. Model structures are given by “LxN” where L is the size of
each layer and N is the number of layers and more_pretrain indicates that pretraining has continued for 500 instead of 250
iterations, and they cluster at the bottom with the lowest error. To smooth the graphs and make them more interpretable,
values at each epoch are the average of a 10-epoch window.
</figureCaption>
<sectionHeader confidence="0.995292" genericHeader="conclusions">
5 Conclusions and Future Work
</sectionHeader>
<bodyText confidence="0.999902864864865">
Normalization of Twitter text is a challenging
task. With a direct application of simple deep
learning techniques and without relying on any
sources of external data, direct or indirect, we
built a model that performed competitively with
the other models in the task. Our method shows
the ability of deep learning to tackle complex
tasks without labor-intensive hand-engineering
of features.
An important direction for future work is sim-
plifying the normalization pipeline. The need for
a Conformer in particular suggests that there is
room for improvement in the model. Although
constructing the normalized form rather than se-
lecting from a list leaves the possibility open that
a system could normalize to a correct word that
did not appear in the training data, in practice
With this additional data, a model may be able to
effectively use context in distinguishing between
multiple possible normalizations of a word. De-
noising autoencoders in particular are known to
make good use of unsupervised data.
A third direction for future work is to investi-
gate more challenging normalization tasks that
include correction of syntax and do not present
the text already tokenized. These will give us an
opportunity to attempt tasks closer to the chal-
lenges our normalization systems will face in the
real world.
Finally, it will be important to investigate the
overall utility of normalization of text as a pre-
processing step for other analysis. While many
tasks will only benefit from cleaning the data, it
is not clear that the canonical forms of words
retain the same connotations that the original
“noisy” versions held. For a simple example, if
we were to normalize “cooooooool” to “cool” we
</bodyText>
<page confidence="0.959377">
160
</page>
<figure confidence="0.994235833333333">
536. Retrieved from
http://books.google.com/books?hl=en&amp;lr=&amp;id=FJ
blV_iOPjIC&amp;oi=fnd&amp;pg=PA213&amp;dq=learning+re
presentations+by+back-
propagating+errors&amp;ots=zZEk5hHYWU&amp;sig=B8
6wdYsAvCWVEN3aA-RCmw8_IJ8
</figure>
<bodyText confidence="0.9976635">
would lose the emphasis implied by the elonga-
tion of the vowel. For some tasks, it may be im-
portant to retain the information contained in
such non-canonical forms.
</bodyText>
<sectionHeader confidence="0.948438" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.999722951612903">
Baldwin, Timothy, Catherine, Marie, Han, Bo, Kim,
Young-Bum, Ritter, Alan, &amp; Xu, Wei. (2015).
Shared Tasks of the 2015 Workshop on Noisy
User-generated Text: Twitter Lexical
Normalization and Named Entity Recognition. In
Proceedings of the Workshop on Noisy User-
generated Text (WNUT 2015). Beijing, China.
Bastien, Frederic, Lamblin, Pascal, Pascanu, Razvan,
Bergstra, James, Goodfellow, Ian, Bergeron,
Arnaud, ... Bengio, Yoshua. (2012). Theano: New
Features and Speed Improvements. In Deep
Learning and Unsupervised Feature Learning
NIPS 2012 Workshop (pp. 1–10). Retrieved from
http://arxiv.org/abs/1211.5590v1
Bengio, Yoshua. (2009). Learning Deep Architectures
for AI. Foundations and Trends® in Machine
Learning, 2(1), 1–127.
http://doi.org/10.1561/2200000006
Bergstra, James, Breuleux, Olivier, Bastien, Frederic,
Lamblin, Pascal, Pascanu, Razvan, Desjardins,
Guillaume, ... Bengio, Yoshua. (2010). Theano: A
CPU and GPU Math Compiler in Python. In
Proceedings of the 9th Python in Science
Conference (pp. 3–10). Austin, Texas.
Collobert, Ronan, Weston, Jason, Bottou, Leon,
Karlen, Michael, Kavukcuoglu, Koray, &amp; Kuksa,
Pavel. (2011). Natural Language Processing
(almost) from Scratch. The Journal of Machine
Learning Research, 12, 2493–2537. Retrieved
from http://dl.acm.org/citation.cfm?id=2078186
Hinton, Geoffrey. (2014). Dropout: A Simple Way to
Prevent Neural Networks from Overfitting. The
Journal of Machine Learning Research, 15, 1929–
1958.
Kalchbrenner, Nal, Grefenstette, Edward, &amp; Blunsom,
Phil. (2014). A Convolutional Neural Network for
Modelling Sentences. ACL, 655–665.
Levenshtein, Vladimir. (1966). Binary Codes Capable
of Correcting Deletions, Insertions, and Reversals.
Soviet Physics Doklady, 10(8), 707–710.
Rumelhart, David, Hinton, Geoffrey, &amp; Williams,
Ronald. (1986). Learning Representations by
Back-propagating Errors. Nature, 323(9), 533–
Vincent, Pascal, Larochelle, Hugo, Bengio, Yoshua,
&amp; Manzagol, Pierre-antoine. (2008). Extracting
and Composing Robust Features with Denoising
Autoencoders. Proceedings of the 25th
International Conference on Machine Learning -
ICML ’08, (July), 1096–1103.
http://doi.org/10.1145/1390156.1390294
Yosinski, Jason, Clune, Jeff, Bengio, Yoshua, &amp;
Lipson, Hod. (2014). How Transferable are
Features in Deep Neural Networks? In Advances
in Neural Information Processing Systems 27 (pp.
1–9).
Zeiler, M. D., Ranzato, M., Monga, R., Mao, M.,
Yang, K., Le, Q. V., ... Hinton, G. E. (2013). On
Rectified Linear Units for Speech Processing.
ICASSP, IEEE International Conference on
Acoustics, Speech and Signal Processing -
Proceedings, 3517–3521.
http://doi.org/10.1109/ICASSP.2013.6638312
</reference>
<page confidence="0.998125">
161
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.558026">
<title confidence="0.9990445">NCSU_SAS_SAM: Deep Encoding and Reconstruction for Normalization of Noisy Text</title>
<author confidence="0.999924">Samuel P Leeman-Munk James C Lester</author>
<affiliation confidence="0.9980175">Center for Educational Informatics North Carolina State University</affiliation>
<address confidence="0.999545">Raleigh, NC, USA</address>
<email confidence="0.999868">spleeman@ncsu.edu</email>
<email confidence="0.999868">lester@ncsu.edu</email>
<author confidence="0.988725">James A. Cox</author>
<affiliation confidence="0.805028">Text Analytics R&amp;D SAS Institute Inc.</affiliation>
<address confidence="0.998624">Cary, NC, USA</address>
<email confidence="0.999898">james.cox@sas.com</email>
<abstract confidence="0.995780352941177">As a participant in the W-NUT Lexical Normalization for English Tweets challenge, we use deep learning to address the constrained task. Specifically, we use a combination of two augmented feed forward neural networks, a flagger that identifies words to be normalized and a normalizer, to take in a single token at a time and output a corrected version of that token. Despite avoiding off-the-shelf tools trained on external data and being an entirely context-free model, our system still achieved an F1-score of 81.49%, comfortably surpassing the next runner up by 1.5% and trailing the second place model by only 0.26%.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Timothy Baldwin</author>
<author>Marie Catherine</author>
<author>Bo Han</author>
<author>Young-Bum Kim</author>
<author>Alan Ritter</author>
<author>Wei Xu</author>
</authors>
<title>Shared Tasks of the 2015 Workshop on Noisy User-generated Text: Twitter Lexical Normalization and Named Entity Recognition.</title>
<date>2015</date>
<booktitle>In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015).</booktitle>
<location>Beijing, China.</location>
<contexts>
<context position="1597" citStr="Baldwin et al., 2015" startWordPosition="245" endWordPosition="248">th of social media, web forums, and online reviews has spurred a growing interest in automated analysis of usergenerated text. User-generated text presents significant computational challenges because it is often highly disfluent. To address these challenges, we have begun to see a growing demand for tools and techniques to transform noisy usergenerated text into a canonical form, most recently in the Workshop on Noisy User Text at the Association for Computational Linguistics. This work describes a submission to the Lexical Normalization for English Tweets challenge as part of this workshop (Baldwin et al., 2015) Motivated by the success of prior deep neural network architectures, particularly denoising autoencoders, we have developed an approach to transform noisy user-generated text into a canonical form with a feed-forward neural network augmented with a projection layer (Collobert et al., 2011; Kalchbrenner, Grefenstette, &amp; Blunsom, 2014; Vincent, Larochelle, Bengio, &amp; Manzagol, 2008). The model performs a character-level analysis on each word of the input. The absence of hand-engineered features and the avoidance of direct and indirect external data make this model unique among the three topperfo</context>
</contexts>
<marker>Baldwin, Catherine, Han, Kim, Ritter, Xu, 2015</marker>
<rawString>Baldwin, Timothy, Catherine, Marie, Han, Bo, Kim, Young-Bum, Ritter, Alan, &amp; Xu, Wei. (2015). Shared Tasks of the 2015 Workshop on Noisy User-generated Text: Twitter Lexical Normalization and Named Entity Recognition. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015). Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Frederic Bastien</author>
<author>Pascal Lamblin</author>
<author>Razvan Pascanu</author>
<author>James Bergstra</author>
<author>Ian Goodfellow</author>
<author>Arnaud Bergeron</author>
</authors>
<title>Theano: New Features and Speed Improvements.</title>
<date>2012</date>
<booktitle>In Deep Learning and Unsupervised Feature Learning NIPS 2012 Workshop</booktitle>
<pages>1--10</pages>
<note>Retrieved from http://arxiv.org/abs/1211.5590v1</note>
<contexts>
<context position="14386" citStr="Bastien et al., 2012" startWordPosition="2414" endWordPosition="2417">and predicts a word that is not present in the dictionary, we replace it with the closest word in the dictionary according to Levenshtein distance (Levenshtein, 1966). Ties are resolved based on which word comes first in the dictionary. Because Python’s set function, which does not guarantee a specific order of its contents, is used to construct the dictionary, the dictionary’s order is not predictable and thus ties are resolved unpredictably. 3 Settings and Evaluation The model was implemented in Theano, a Python library for fast evaluation of multidimensional arrays using matrix operations (Bastien et al., 2012; Bergstra et al., 2010). We used Theano’s implementation of backpropagation to train our model. For our window size, we selected 25 characters, which is large enough to completely represent 99.9% of the tokens in the training data while remaining computationally feasible. There are also a number of hyperparameters: the number and size of hidden layers, the size of character embeddings, and the dropout rate. We tried various combinations of values between 50 and 6000 for the size and 1 and 4 for the number of hidden layers in both our Normalizer and Flagger. Some combinations we tried can be s</context>
</contexts>
<marker>Bastien, Lamblin, Pascanu, Bergstra, Goodfellow, Bergeron, 2012</marker>
<rawString>Bastien, Frederic, Lamblin, Pascal, Pascanu, Razvan, Bergstra, James, Goodfellow, Ian, Bergeron, Arnaud, ... Bengio, Yoshua. (2012). Theano: New Features and Speed Improvements. In Deep Learning and Unsupervised Feature Learning NIPS 2012 Workshop (pp. 1–10). Retrieved from http://arxiv.org/abs/1211.5590v1</rawString>
</citation>
<citation valid="true">
<authors>
<author>Yoshua Bengio</author>
</authors>
<title>Learning Deep Architectures for AI.</title>
<date>2009</date>
<journal>Foundations and Trends® in Machine Learning,</journal>
<volume>2</volume>
<issue>1</issue>
<pages>1--127</pages>
<contexts>
<context position="4693" citStr="Bengio, 2009" startWordPosition="767" endWordPosition="768">ural network forms the basis of our model. A deep feedforward neural network takes a vector of numbers as input. This vector is known as a layer and each value within it is a neuron. The network A deep feed-forward neural network can contain any number of hidden layers, each going through the same process, multiplying by a matrix of weights and transforming via a non-linearity. Hidden layers may also be of any size. Multiple applications of learnable weight matrices and non-linear transformations together allow a deep neural network to represent complex relationships between input and output (Bengio, 2009). Deep feed-forward neural networks are trained by backpropagation. Backpropagation is a training method by which the gradient of any given weight in a network can be calculated from the error between the output of the network and a gold standard. It is described in more detail in (Rumelhart, Hinton, &amp; Williams, 1986). 2.2 The Normalizer Our use of deep feed-forward neural networks for the task of normalization is inspired by the success of denoising autoencoders. (Vincent et al., 2008). Denoising autoencoders are neural Figure 1: A flowchart detailing the process of normalizing a word. Inform</context>
</contexts>
<marker>Bengio, 2009</marker>
<rawString>Bengio, Yoshua. (2009). Learning Deep Architectures for AI. Foundations and Trends® in Machine Learning, 2(1), 1–127. http://doi.org/10.1561/2200000006</rawString>
</citation>
<citation valid="true">
<authors>
<author>James Bergstra</author>
<author>Olivier Breuleux</author>
<author>Frederic Bastien</author>
<author>Pascal Lamblin</author>
<author>Razvan Pascanu</author>
<author>Guillaume Desjardins</author>
</authors>
<date>2010</date>
<title>Theano: A CPU and GPU Math Compiler in Python.</title>
<booktitle>In Proceedings of the 9th Python in Science Conference (pp. 3–10).</booktitle>
<location>Austin, Texas.</location>
<contexts>
<context position="14410" citStr="Bergstra et al., 2010" startWordPosition="2418" endWordPosition="2421">at is not present in the dictionary, we replace it with the closest word in the dictionary according to Levenshtein distance (Levenshtein, 1966). Ties are resolved based on which word comes first in the dictionary. Because Python’s set function, which does not guarantee a specific order of its contents, is used to construct the dictionary, the dictionary’s order is not predictable and thus ties are resolved unpredictably. 3 Settings and Evaluation The model was implemented in Theano, a Python library for fast evaluation of multidimensional arrays using matrix operations (Bastien et al., 2012; Bergstra et al., 2010). We used Theano’s implementation of backpropagation to train our model. For our window size, we selected 25 characters, which is large enough to completely represent 99.9% of the tokens in the training data while remaining computationally feasible. There are also a number of hyperparameters: the number and size of hidden layers, the size of character embeddings, and the dropout rate. We tried various combinations of values between 50 and 6000 for the size and 1 and 4 for the number of hidden layers in both our Normalizer and Flagger. Some combinations we tried can be seen in the results secti</context>
</contexts>
<marker>Bergstra, Breuleux, Bastien, Lamblin, Pascanu, Desjardins, 2010</marker>
<rawString>Bergstra, James, Breuleux, Olivier, Bastien, Frederic, Lamblin, Pascal, Pascanu, Razvan, Desjardins, Guillaume, ... Bengio, Yoshua. (2010). Theano: A CPU and GPU Math Compiler in Python. In Proceedings of the 9th Python in Science Conference (pp. 3–10). Austin, Texas.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Ronan Collobert</author>
<author>Jason Weston</author>
<author>Leon Bottou</author>
<author>Michael Karlen</author>
<author>Koray Kavukcuoglu</author>
<author>Pavel Kuksa</author>
</authors>
<title>Natural Language Processing (almost) from Scratch.</title>
<date>2011</date>
<journal>The Journal of Machine Learning Research,</journal>
<volume>12</volume>
<pages>2493--2537</pages>
<note>Retrieved from http://dl.acm.org/citation.cfm?id=2078186</note>
<contexts>
<context position="1887" citStr="Collobert et al., 2011" startWordPosition="288" endWordPosition="291">g demand for tools and techniques to transform noisy usergenerated text into a canonical form, most recently in the Workshop on Noisy User Text at the Association for Computational Linguistics. This work describes a submission to the Lexical Normalization for English Tweets challenge as part of this workshop (Baldwin et al., 2015) Motivated by the success of prior deep neural network architectures, particularly denoising autoencoders, we have developed an approach to transform noisy user-generated text into a canonical form with a feed-forward neural network augmented with a projection layer (Collobert et al., 2011; Kalchbrenner, Grefenstette, &amp; Blunsom, 2014; Vincent, Larochelle, Bengio, &amp; Manzagol, 2008). The model performs a character-level analysis on each word of the input. The absence of hand-engineered features and the avoidance of direct and indirect external data make this model unique among the three topperforming models in the constrained task. This paper is organized as follows. In Section 2 we describe each component of our model. In Section 3 we describe the specific instantiation of our model, and in Section 4 we present and discuss results. 2 Architecture and Components Our model consist</context>
<context position="7351" citStr="Collobert et al., 2011" startWordPosition="1195" endWordPosition="1198">sing autoencoder. It reads the character sequence that describes the word and encodes it 155 internally, outputting the denoised (normalized) version. It accomplishes this in three sets of layers. First the character projection layer takes a string and represents it as a fixed-length numeric vector. Next, a feed-forward neural network converts the data into its internal representation and, with a special output layer, into a denoised version of the input. Figure 2 shows a diagram of the Normalizer’s architecture. The first step of the Normalizer is performed by the character projection layer (Collobert et al., 2011). The character projection layer learns floating point vector representations of characters, which it concatenates into one large floating point vector word representation. In our example, the letter “u” is represented by n floating point numbers. For example, if n = 3 the representation for “u” might be [0.1, -1.2, -0.3]. This vector was chosen arbitrarily, but in the actual model, values are learned in training. The representations allow more information to be associated with a character than a simple numeric index. In this simple example, the word “u” is composed of one character, but if it</context>
</contexts>
<marker>Collobert, Weston, Bottou, Karlen, Kavukcuoglu, Kuksa, 2011</marker>
<rawString>Collobert, Ronan, Weston, Jason, Bottou, Leon, Karlen, Michael, Kavukcuoglu, Koray, &amp; Kuksa, Pavel. (2011). Natural Language Processing (almost) from Scratch. The Journal of Machine Learning Research, 12, 2493–2537. Retrieved from http://dl.acm.org/citation.cfm?id=2078186</rawString>
</citation>
<citation valid="true">
<authors>
<author>Geoffrey Hinton</author>
</authors>
<title>Dropout: A Simple Way to Prevent Neural Networks from Overfitting.</title>
<date>2014</date>
<journal>The Journal of Machine Learning Research,</journal>
<volume>15</volume>
<pages>1929--1958</pages>
<contexts>
<context position="15457" citStr="Hinton, 2014" startWordPosition="2598" endWordPosition="2599">n 50 and 6000 for the size and 1 and 4 for the number of hidden layers in both our Normalizer and Flagger. Some combinations we tried can be seen in the results section. Especially large sizes and numbers of layers proved to require more memory than our GPU could support, and training them on our CPU was exceptionally slow. We also tried 50% and 75% dropout, meaning that during training we randomly excluded hidden nodes from consideration at each 157 layer. Dropout has been shown to improve performance by discouraging overfitting on the training data, and 50% and 75% are common dropout rates (Hinton, 2014). We found the highest F1 score on the validation data for the Normalizer with two hidden layers of size 2000 each and 50% dropout. This was close to the maximum size our GPU could support without reducing the batch size to be too small to take advantage of the parallelism. The Flagger’s highest score was found at two hidden layers of size 1000 each and 75% dropout. Attempts to provide hidden layers of different sizes consistently found inferior results. For the size of each embedding in the character projection layer, 10 had proven effective earlier in a simpler unpublished Twitter part-of-sp</context>
</contexts>
<marker>Hinton, 2014</marker>
<rawString>Hinton, Geoffrey. (2014). Dropout: A Simple Way to Prevent Neural Networks from Overfitting. The Journal of Machine Learning Research, 15, 1929– 1958.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Nal Kalchbrenner</author>
<author>Edward Grefenstette</author>
<author>Phil Blunsom</author>
</authors>
<title>A Convolutional Neural Network for Modelling Sentences.</title>
<date>2014</date>
<pages>655--665</pages>
<publisher>ACL,</publisher>
<contexts>
<context position="1932" citStr="Kalchbrenner, Grefenstette, &amp; Blunsom, 2014" startWordPosition="292" endWordPosition="296">echniques to transform noisy usergenerated text into a canonical form, most recently in the Workshop on Noisy User Text at the Association for Computational Linguistics. This work describes a submission to the Lexical Normalization for English Tweets challenge as part of this workshop (Baldwin et al., 2015) Motivated by the success of prior deep neural network architectures, particularly denoising autoencoders, we have developed an approach to transform noisy user-generated text into a canonical form with a feed-forward neural network augmented with a projection layer (Collobert et al., 2011; Kalchbrenner, Grefenstette, &amp; Blunsom, 2014; Vincent, Larochelle, Bengio, &amp; Manzagol, 2008). The model performs a character-level analysis on each word of the input. The absence of hand-engineered features and the avoidance of direct and indirect external data make this model unique among the three topperforming models in the constrained task. This paper is organized as follows. In Section 2 we describe each component of our model. In Section 3 we describe the specific instantiation of our model, and in Section 4 we present and discuss results. 2 Architecture and Components Our model consists of three components: a Normalizer that enco</context>
</contexts>
<marker>Kalchbrenner, Grefenstette, Blunsom, 2014</marker>
<rawString>Kalchbrenner, Nal, Grefenstette, Edward, &amp; Blunsom, Phil. (2014). A Convolutional Neural Network for Modelling Sentences. ACL, 655–665.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Vladimir Levenshtein</author>
</authors>
<title>Binary Codes Capable of Correcting Deletions, Insertions, and Reversals.</title>
<date>1966</date>
<journal>Soviet Physics Doklady,</journal>
<volume>10</volume>
<issue>8</issue>
<pages>707--710</pages>
<contexts>
<context position="13932" citStr="Levenshtein, 1966" startWordPosition="2344" endWordPosition="2345">w to represent. In particular “lmfao” expands to an impressive 27 characters, but if the Normalizer predicts only the first 25 characters, the Conformer can easily select the correct token. To correct these small normalizer errors we construct the Conformer by collecting a dictionary from the gold standard training data. The dictionary is simply a list of all the unique words in the gold standard data. Then at runtime, whenever the Normalizer runs and predicts a word that is not present in the dictionary, we replace it with the closest word in the dictionary according to Levenshtein distance (Levenshtein, 1966). Ties are resolved based on which word comes first in the dictionary. Because Python’s set function, which does not guarantee a specific order of its contents, is used to construct the dictionary, the dictionary’s order is not predictable and thus ties are resolved unpredictably. 3 Settings and Evaluation The model was implemented in Theano, a Python library for fast evaluation of multidimensional arrays using matrix operations (Bastien et al., 2012; Bergstra et al., 2010). We used Theano’s implementation of backpropagation to train our model. For our window size, we selected 25 characters, w</context>
</contexts>
<marker>Levenshtein, 1966</marker>
<rawString>Levenshtein, Vladimir. (1966). Binary Codes Capable of Correcting Deletions, Insertions, and Reversals. Soviet Physics Doklady, 10(8), 707–710.</rawString>
</citation>
<citation valid="true">
<authors>
<author>David Rumelhart</author>
<author>Geoffrey Hinton</author>
<author>Ronald Williams</author>
</authors>
<title>Learning Representations by Back-propagating Errors.</title>
<date>1986</date>
<journal>Nature,</journal>
<volume>323</volume>
<issue>9</issue>
<pages>533--536</pages>
<contexts>
<context position="5011" citStr="Rumelhart, Hinton, &amp; Williams, 1986" startWordPosition="817" endWordPosition="821">me process, multiplying by a matrix of weights and transforming via a non-linearity. Hidden layers may also be of any size. Multiple applications of learnable weight matrices and non-linear transformations together allow a deep neural network to represent complex relationships between input and output (Bengio, 2009). Deep feed-forward neural networks are trained by backpropagation. Backpropagation is a training method by which the gradient of any given weight in a network can be calculated from the error between the output of the network and a gold standard. It is described in more detail in (Rumelhart, Hinton, &amp; Williams, 1986). 2.2 The Normalizer Our use of deep feed-forward neural networks for the task of normalization is inspired by the success of denoising autoencoders. (Vincent et al., 2008). Denoising autoencoders are neural Figure 1: A flowchart detailing the process of normalizing a word. Information flows from left to right and ellipses represent data objects while rectangles represent processes. multiplies the input layer by a matrix of weights to return another vector. This new vector is then transformed by a non-linearity. A number of functions can serve as the non-linearity, including the sigmoid and t</context>
<context position="11656" citStr="Rumelhart et al., 1986" startWordPosition="1971" endWordPosition="1974">total negative log likelihood of the gold standard. Mathematically, our objective function is cost = − 𝑙𝑛 𝑝 P∈P Where p is an element in P, the vector of the probabilities of each gold standard letter. So, if our model predicts “y” as 75% likely for character 1, “o” as 95% likely for character 2, and “u” as 89% likely for character 3 in our window of size 3, the negative log likelihoods calculated as (.29, .05, .12) are summed to get the error. This sum error gives a simple measurement of performance to optimize, which backpropagates through the model to learn all the weights described above (Rumelhart et al., 1986). 2.3 The Flagger The Flagger identifies what does and does not require normalization. The vast majority of the training data (91%) does not require normalization, so returning the reconstructed encoding of every word would risk incorrectly regenerating an already canonical token. The Flagger has the same general structure as the Normalizer itself except for the final layer. Instead of generating text at the last layer, a softmax layer predicts whether the token should be normalized at all. Thus, the Flagger’s output layer is two neurons in size, one representing the flag “Do Normalize,” and a</context>
</contexts>
<marker>Rumelhart, Hinton, Williams, 1986</marker>
<rawString>Rumelhart, David, Hinton, Geoffrey, &amp; Williams, Ronald. (1986). Learning Representations by Back-propagating Errors. Nature, 323(9), 533–536.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Pascal Vincent</author>
<author>Hugo Larochelle</author>
<author>Yoshua Bengio</author>
<author>Pierre-antoine Manzagol</author>
</authors>
<title>Extracting and Composing Robust Features with Denoising Autoencoders.</title>
<date>2008</date>
<booktitle>Proceedings of the 25th International Conference on Machine Learning -ICML ’08,</booktitle>
<pages>1096--1103</pages>
<contexts>
<context position="1979" citStr="Vincent, Larochelle, Bengio, &amp; Manzagol, 2008" startWordPosition="297" endWordPosition="302">xt into a canonical form, most recently in the Workshop on Noisy User Text at the Association for Computational Linguistics. This work describes a submission to the Lexical Normalization for English Tweets challenge as part of this workshop (Baldwin et al., 2015) Motivated by the success of prior deep neural network architectures, particularly denoising autoencoders, we have developed an approach to transform noisy user-generated text into a canonical form with a feed-forward neural network augmented with a projection layer (Collobert et al., 2011; Kalchbrenner, Grefenstette, &amp; Blunsom, 2014; Vincent, Larochelle, Bengio, &amp; Manzagol, 2008). The model performs a character-level analysis on each word of the input. The absence of hand-engineered features and the avoidance of direct and indirect external data make this model unique among the three topperforming models in the constrained task. This paper is organized as follows. In Section 2 we describe each component of our model. In Section 3 we describe the specific instantiation of our model, and in Section 4 we present and discuss results. 2 Architecture and Components Our model consists of three components: a Normalizer that encodes the input and then reconstructs it in norma</context>
<context position="5184" citStr="Vincent et al., 2008" startWordPosition="845" endWordPosition="848">ar transformations together allow a deep neural network to represent complex relationships between input and output (Bengio, 2009). Deep feed-forward neural networks are trained by backpropagation. Backpropagation is a training method by which the gradient of any given weight in a network can be calculated from the error between the output of the network and a gold standard. It is described in more detail in (Rumelhart, Hinton, &amp; Williams, 1986). 2.2 The Normalizer Our use of deep feed-forward neural networks for the task of normalization is inspired by the success of denoising autoencoders. (Vincent et al., 2008). Denoising autoencoders are neural Figure 1: A flowchart detailing the process of normalizing a word. Information flows from left to right and ellipses represent data objects while rectangles represent processes. multiplies the input layer by a matrix of weights to return another vector. This new vector is then transformed by a non-linearity. A number of functions can serve as the non-linearity, including the sigmoid and the hyperbolic tangent, but our model uses a rectified linear unit, given by the following expression. 𝑦= max 𝑥, 0 The rectified linear unit has been successful in a number o</context>
</contexts>
<marker>Vincent, Larochelle, Bengio, Manzagol, 2008</marker>
<rawString>Vincent, Pascal, Larochelle, Hugo, Bengio, Yoshua, &amp; Manzagol, Pierre-antoine. (2008). Extracting and Composing Robust Features with Denoising Autoencoders. Proceedings of the 25th International Conference on Machine Learning -ICML ’08, (July), 1096–1103. http://doi.org/10.1145/1390156.1390294</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jason Yosinski</author>
<author>Jeff Clune</author>
<author>Yoshua Bengio</author>
<author>Hod Lipson</author>
</authors>
<title>How Transferable are Features in Deep Neural Networks?</title>
<date>2014</date>
<booktitle>In Advances in Neural Information Processing Systems</booktitle>
<volume>27</volume>
<pages>1--9</pages>
<contexts>
<context position="18183" citStr="Yosinski, Clune, Bengio, &amp; Lipson, 2014" startWordPosition="3050" endWordPosition="3055">ddings with their previously trained values. When we trained the full model using two hidden layers, we initialized both the character projection layer and the weights from the projected input to the first hidden layer with the values learned before. The model continued to learn all the weights it used. Pretrained weights continued to be trained in the full model, although “freezing” some pretrained weights after pretraining and only training later weights in the full model has shown success when working with large amounts of unsupervised data and may be worthwhile to consider in future work (Yosinski, Clune, Bengio, &amp; Lipson, 2014). Running on an NVIDIA GeForce GTX 680 GPU with 2 GB of onboard memory, training the Normalizer took about six hours. We do not include CPU and RAM specifications because they were not heavily utilized in the GPU implementation. The Flagger was considerably faster to train than the Normalizer, taking only a little over half an hour. 4 Results and Discussion The model earned third place in the competition, with scores very close to the second place model. The model’s results in the competition compared to the first, second, and fourth place models is shown in Table 1. The precision scores are </context>
</contexts>
<marker>Yosinski, Clune, Bengio, Lipson, 2014</marker>
<rawString>Yosinski, Jason, Clune, Jeff, Bengio, Yoshua, &amp; Lipson, Hod. (2014). How Transferable are Features in Deep Neural Networks? In Advances in Neural Information Processing Systems 27 (pp. 1–9).</rawString>
</citation>
<citation valid="true">
<authors>
<author>M D Zeiler</author>
<author>M Ranzato</author>
<author>R Monga</author>
<author>M Mao</author>
<author>K Yang</author>
<author>Q V Le</author>
</authors>
<title>On Rectified Linear Units for Speech Processing.</title>
<date>2013</date>
<booktitle>ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing -Proceedings,</booktitle>
<pages>3517--3521</pages>
<contexts>
<context position="5856" citStr="Zeiler et al., 2013" startWordPosition="953" endWordPosition="956">chart detailing the process of normalizing a word. Information flows from left to right and ellipses represent data objects while rectangles represent processes. multiplies the input layer by a matrix of weights to return another vector. This new vector is then transformed by a non-linearity. A number of functions can serve as the non-linearity, including the sigmoid and the hyperbolic tangent, but our model uses a rectified linear unit, given by the following expression. 𝑦= max 𝑥, 0 The rectified linear unit has been successful in a number of natural language tasks such as speech processing (Zeiler et al., 2013), and it was effective in an unpublished part-of-speech tagging model we developed. The transformed vector is referred to as a hidden layer because its values are never directly observed in the normal functioning of the model. networks whose output is the same as their input. That is, they specialize in developing a robust encoding of an input such that the input can be reconstructed from the encoding alone. The denoising aspect refers to the fact that to encourage robustness, denoising autoencoders are given inputs that have been deliberately corrupted, or “noised” and are expected to reconst</context>
</contexts>
<marker>Zeiler, Ranzato, Monga, Mao, Yang, Le, 2013</marker>
<rawString>Zeiler, M. D., Ranzato, M., Monga, R., Mao, M., Yang, K., Le, Q. V., ... Hinton, G. E. (2013). On Rectified Linear Units for Speech Processing. ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing -Proceedings, 3517–3521.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>