<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.219230">
<title confidence="0.997914">
NCSU_SAS_WOOKHEE: A Deep Contextual
Long-Short Term Memory Model
for Text Normalization
</title>
<author confidence="0.992561">
Wookhee Min Bradford W. Mott
</author>
<affiliation confidence="0.795999">
Center for Educational Informatics
North Carolina State University
</affiliation>
<address confidence="0.899729">
Raleigh, NC, USA
</address>
<email confidence="0.999564">
{wmin, bwmott}@ncsu.edu
</email>
<sectionHeader confidence="0.995652" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.99986605">
To address the challenges of normalizing online conversational texts prevalent in social media, we propose a contextual long-short term memory (LSTM) recurrent neural network based approach, augmented with a self-generated dictionary normalization technique. Our approach utilizes a sequence of characters as well as the part-of-speech associated with words, without harnessing any external lexical resources. This work is evaluated on the English Tweet data set provided by the ACL 2015 W-NUT Normalization of Noisy Text shared task. The results, achieving second place (F1 score: 81.75%) in the constrained track of the competition, indicate that the proposed LSTM-based approach is a promising tool for normalizing non-standard language.
</bodyText>
<sectionHeader confidence="0.999184" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.99785338">
Recent years have seen increasing use of online social media such as Twitter and Facebook, which has generated a growing body of text where non-standard language is prevalent. These non-standard lexical items take many different forms, including unintentional errors based on users' cognitive misconceptions and typographical errors, and intentional non-canonical language such as abbreviations, word lengthening by duplication of characters, Internet slang, phonetic substitutions, and creative use of language (Chrupała, 2014; Owoputi et al., 2013).

A key challenge posed by these non-standard texts is their negative impact on traditional natural language processing (NLP) pipelines, evidenced by noticeably lower predictive accuracy in various domains such as part-of-speech tagging (Gimpel et al., 2011) and named entity recognition (Ritter et al., 2011) compared to more standard text. To address this challenge, text normalization techniques have been widely investigated, ranging from extracting domain-specific lexical variants (Gouws et al., 2011), unified letter transformation (Liu et al., 2011), and dictionary-based methods using string substitution (Han et al., 2012) in an unsupervised manner, to character-level edit operation prediction utilizing conditional random fields in a supervised manner (Chrupała, 2014).

Because language data consists of sequential information, such as streams of characters and sequences of words, many NLP approaches leverage computational models that can effectively deal with temporal data, such as hidden Markov models and conditional random fields (Täckström et al., 2013; Chrupała, 2014). More recently, deep learning models (e.g., multi-layer feed-forward neural networks, recurrent neural networks, recursive neural networks) have been used in NLP to achieve state-of-the-art performance in areas such as speech recognition (Hinton et al., 2012) and sentiment analysis (Socher et al., 2013). The success of deep learning has been attainable with the emergence of effective training methods for deep networks, such as pre-training (Vincent et al., 2010) and optimization techniques (Zeiler, 2012; Martens and Sutskever, 2011) that significantly diminish the vanishing and exploding gradient problems that
</bodyText>
<page confidence="0.983342">
111
</page>
<note confidence="0.988463">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 111–119,
Beijing, China, July 31, 2015. © 2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.999751727272727">
are often observed in multi-layer neural network training.

In this work, we leverage long-short term memory models (LSTMs) (Hochreiter and Schmidhuber, 1997; Graves, 2012), a variant of recurrent neural networks, to conduct text normalization on the data set provided by the W-NUT English lexical normalization shared task (Baldwin et al., 2015). We additionally harness the part-of-speech tagger created by the Noah's Ark research team (Owoputi et al., 2013), a resource freely available to the constrained task. Similar to Chrupała's work, which predicts Levenshtein edit operations between canonical and non-canonical forms of words (Chrupała, 2014), the proposed approach predicts word-level edit operations based on character-level inputs. The proposed approach is novel compared to previous work from four perspectives: (1) it utilizes LSTMs to predict word-level edit operations, along with a dictionary induced from the training set; (2) it takes as input the surrounding words as well as the current word to capture contextual information about the predicted word, while any additional contextual information (e.g., part-of-speech tags) is treated as heading characters of the word; (3) character and part-of-speech embeddings are learned on the fly in the normalization task instead of being trained in a separate model; and (4) the self-generated dictionary-based normalization, as an antecedent step, provides statistically significant F1 gains over the standalone computational model.
</bodyText>
<sectionHeader confidence="0.94139" genericHeader="method">
2 System Architecture
</sectionHeader>
<bodyText confidence="0.9999901875">
Our proposed system consists of three steps. In the first step, it filters out domain-specific entities such as tokens beginning with @, hash-tags, and URLs. Next, the system searches for words contained in a dictionary generated solely from the training data and normalizes them when appropriate. If words are not normalized in the previous steps, they are passed to the third step, where an LSTM model predicts the canonical form of each word, utilizing the word itself and its surrounding words (the previous and following word). In the next sub-sections, we detail how the three steps operate together and explain how the LSTM model is learned from the training set, along with a high-level illustration of the architecture.
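A minimal sketch of this three-step flow in Python (the function and argument names are ours, not from the released system; dictionary maps raw tokens to the set of canonical forms seen in training, and lstm_predict stands in for the model described in Section 2.3):

    import re

    def normalize_token(token, dictionary, lstm_predict):
        word = token.lower()
        # Step 1: domain-specific entities (at-mentions, hash-tags, URLs)
        # are left untouched.
        if re.match(r"^(@\w+|#\w+|https?://\S+)$", word):
            return token
        # Step 2: words with exactly one canonical form in the training-set
        # dictionary are normalized directly.
        forms = dictionary.get(word)
        if forms is not None and len(forms) == 1:
            return next(iter(forms))
        # Step 3: ambiguous and out-of-vocabulary words go to the LSTM model.
        return lstm_predict(word)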
</bodyText>
<subsectionHeader confidence="0.998209">
2.1 Sequence Flow
</subsectionHeader>
<bodyText confidence="0.9999918">
The proposed model operates in three primary phases: (1) domain-specific entity filtering, (2) dictionary-based normalization, and (3) LSTM-based normalization.

The first phase performs a simple preprocessing of input words. First, every word is converted to its lowercase form. Second, words that are hash-tags, at-mentions, or URLs are filtered out and left as-is. This preprocessing is useful since the W-NUT normalization task guideline suggests not changing domain-specific entities, and including them could inject noise into the predictive models for non-filtered word prediction.

Second, to conduct the dictionary-based normalization, a dictionary is generated from the training set as an index of raw tokens with a list of their normalized forms. For instance, words such as "ur" and "no" are multiply-mapped words, for which multiple canonical forms are observed in the training set, such as ["your", "you are"] and ["no", "know", "not"], respectively. This is mainly because they are normalized differently depending on the context in which they are used in tweets. On the other hand, words that have a single mapped word are unique in terms of their post-normalization form. We denote the first type of words (multiple mappings) as ambiguous words, and the second type (single mapping) as unique words. We use this mapping information as a criterion for deciding whether to normalize words based on the dictionary or to pass the decision over to the LSTM model in the third step. If a word in the test set turns out to be a unique word, we label the word with the corresponding mapping as defined in the dictionary; otherwise, we pass the word to the LSTM model. This is based on the assumption that these unique words are more likely to have the same unique form in unseen texts.
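A sketch of the dictionary construction under this description (the "ur" and "no" mappings come from the text; the "gr8" pair is an invented illustration):

    from collections import defaultdict

    def build_dictionary(training_pairs):
        # Index each raw token by every canonical form observed for it.
        mapping = defaultdict(set)
        for raw, canonical in training_pairs:
            mapping[raw.lower()].add(canonical)
        return mapping

    pairs = [("ur", "your"), ("ur", "you are"), ("no", "no"),
             ("no", "know"), ("no", "not"), ("gr8", "great")]
    dictionary = build_dictionary(pairs)
    ambiguous = {w for w, f in dictionary.items() if len(f) > 1}  # {"ur", "no"}
    unique = {w for w, f in dictionary.items() if len(f) == 1}    # {"gr8"}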
Finally, once the second normalization step is completed, only words that are either ambiguous or out-of-vocabulary with respect to the training-set dictionary remain, and these are sent to the LSTM model. For ambiguous words, it is important that the model accurately identify the right usage of the words considering the context of the tweet. Additionally, it is necessary to capture common patterns of standard words, so that the model can predict canonical forms of words
</bodyText>
<page confidence="0.995984">
112
</page>
<figureCaption confidence="0.90719075">
Figure 1: The input features based on characters and POS tags, and the output label (edit operations) used to find the normalized form of the nth word (Word_n). The POS tags and constituent characters of the preceding word (Word_{n-1}) and the following word (Word_{n+1}) are combined with the current word (Word_n) as input features, where Word_{n-1}, Word_n, and Word_{n+1} have x, y, and z characters, respectively.
</figureCaption>
<bodyText confidence="0.9998602">
even when they are not seen in the training set. In this work, we formulate an LSTM model to predict an integrated list of edit operations that turn a word into a standard word, leveraging its contextual information.
</bodyText>
<subsectionHeader confidence="0.99762">
2.2 Data Set Encoding for LSTM
</subsectionHeader>
<bodyText confidence="0.999991883116883">
A preliminary step in inducing an LSTM model is to encode the data set in a trainable format (i.e., a specification of input features and output labels). We define the input format as a list of sequential, lowercased characters that compose the previous word, current word, and following word. Each character is mapped to a unique index (0–66), since there are a total of 67 different characters in the training data after the preprocessing step described in Section 2.1. If the current word does not have a previous or next word (e.g., the first or last word in a tweet), a padding character is assigned for the previous or following word so that every example has a consistent format. In this work, we additionally consider a word's part-of-speech (POS) as extra input to the model, as previous literature (e.g., Yarowsky, 1997; Täckström et al., 2013) indicates POS tags can improve performance in other natural language processing tasks, such as text-to-speech synthesis and parsing. We use an off-the-shelf POS tagger that features Brown clustering: the CMU Twitter Part-of-Speech Tagger, which achieves a state-of-the-art tagging accuracy of 93% on a Twitter benchmark data set (Owoputi et al., 2013). The extracted POS information is added as a distinct heading character to each word, so that it is leveraged in the LSTM models. Similar to the character padding, we apply POS padding for missing previous or next words. Note that leveraging POS tags in this way extends to any other word-level meta-information; here we examine the feasibility of the technique with POS tags in the context of text normalization. The input encoding for predicting the edit operations of the current word (Word_n) is described in Figure 1. To summarize, the number of inputs in a sequence is 3 + x + y + z, where x, y, and z are the numbers of characters of the previous, current, and following word, respectively, and 3 is derived from the POS tags of the three words.
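A sketch of this encoding, assuming for simplicity that POS tags and characters share one symbol vocabulary (the paper learns separate character and POS embeddings, and the padding symbols below are our own placeholders):

    PAD_CHAR, PAD_POS = "\x00", "PPAD"  # placeholder padding symbols

    def encode_example(context, symbol_index):
        # context: [(prev_word, prev_pos), (word, pos), (next_word, next_pos)];
        # a missing neighbor (None) is replaced by padding so every example
        # has a consistent format.
        sequence = []
        for word, pos in context:
            if word is None:
                word, pos = PAD_CHAR, PAD_POS
            sequence.append(symbol_index[pos])              # heading POS symbol
            sequence.extend(symbol_index[c] for c in word)  # word characters
        return sequence  # length 3 + x + y + z

    symbols = [PAD_CHAR, PAD_POS, "D", "V", "e", "h", "r", "t", "u", "y"]
    symbol_index = {s: i for i, s in enumerate(symbols)}
    print(encode_example([(None, None), ("ur", "D"), ("they", "V")], symbol_index))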
The output encoding is based on the Levenshtein distance algorithm (Levenshtein, 1966), which supports three operations: insert, replace, and delete, inspired by Chrupała's text normalization work (Chrupała, 2014). In this work, we reformulate his approach to predict word-level edit operations instead of character-level edit operations, so that the model predicts one label per token. In character-level prediction, correctly normalizing a token requires correct predictions for every character that belongs to the word (i.e., probabilities get multiplied), whereas word-level prediction requires one prediction per token.

Given a training sample, the Levenshtein distance algorithm calculates the edit operations required to convert the possibly non-standard word into the corresponding canonical form. For example, "dese" and "dey", with the canonical forms "these" and "they" respectively, have edit operations of "insert_t_replace_h, none, none, none, none" and "insert_t_replace_h, none, none, none" (a comma is used as a delimiter between characters). Note that the insert operation only supports inserting a character before the current character, so to support insertions at the end of a word, every input word (e.g., "doin") is concatenated with an empty character (e.g., "doin "), and edit operations can be applied to the empty character (e.g., "none, none, none, none, insert_g"). Since more class labels make this multi-class classification task more challenging, it is important to have an optimized set of edit operation labels, while the model should still be capable of converting words to their canonical forms based on the given edit operation.
</bodyText>
<page confidence="0.995647">
113
</page>
<bodyText confidence="0.999992296296296">
For the preceding example, one way to shrink the label set is to omit repeated none operations at the end of the string. With this optimization applied, both "dese" and "dey" have the same edit operation "insert_t_replace_h", ignoring all following "none"s. From this, the model can successfully decode the operation by replacing the first character "d" with "th" and appending the remaining stream of characters from each example, thereby constructing "these" and "they", respectively. A more pronounced benefit of this technique appears when no change is required: all no-change examples share a single common label, nothing, where otherwise a series of none operations would be generated as independent labels depending on the length of the word.
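A sketch of this label collapsing, assuming the per-character operations have already been produced by a Levenshtein alignment (not shown here):

    def collapse_label(ops):
        # Drop the trailing run of "none" operations; an all-"none" sequence
        # collapses to the single shared label "nothing".
        while ops and ops[-1] == "none":
            ops = ops[:-1]
        return ",".join(ops) if ops else "nothing"

    # "dese" -> "these" and "dey" -> "they" share one label after collapsing:
    print(collapse_label(["insert_t_replace_h", "none", "none", "none", "none"]))
    print(collapse_label(["insert_t_replace_h", "none", "none", "none"]))
    # "doin " -> "doing": the end-insertion lands on the appended empty slot.
    print(collapse_label(["none", "none", "none", "none", "insert_g"]))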
Another challenge in the edit operation approach lies in dealing with variants of repeated occurrences of the same character (e.g., "sooo", "soooo"), often used to emphasize a word, since the required edit operations would be treated as different labels (e.g., "none, none, delete, delete" and "none, none, delete, delete, delete") despite encoding similar edits (note that we omit the last none due to the previously mentioned optimization). To address this challenge, we attempt to replace characters that occur consecutively more than twice with a single character or double characters, and check whether the converted word exists in the dictionary (in this work, the double-character conversion has higher priority than the single-character conversion if both exist in the dictionary). If the converted word appears in the dictionary, we use it as the input word and calculate the edit operations based on the converted word; otherwise, we use the original word as the input word. We expect this to reduce the number of possible labels (e.g., both "sooo" and "soooo" are converted to "so", since "so" is defined as a canonical form in the dictionary while "soo" is not, and so both of them have the edit operation nothing). As a result, the total number of labels obtained from the training set is reduced from 706 to 694.
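A sketch of this reduction (the regular expressions stand in for whatever string processing the system actually uses):

    import re

    def squeeze_repeats(word, dictionary):
        # Replace runs of 3+ identical characters with double, then single,
        # characters; the double-character form takes priority when both
        # converted words exist in the dictionary.
        if not re.search(r"(.)\1\1", word):
            return word
        double = re.sub(r"(.)\1{2,}", r"\1\1", word)
        single = re.sub(r"(.)\1{2,}", r"\1", word)
        for candidate in (double, single):
            if candidate in dictionary:
                return candidate
        return word

    print(squeeze_repeats("sooo", {"so"}))   # -> "so"
    print(squeeze_repeats("soooo", {"so"}))  # -> "so"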
Training examples for the LSTMs are built from all words except the hash-tags, at-mentions, and URLs filtered in the first step, regardless of whether a word is ambiguous or unique. In this manner, we expect the LSTMs to capture context information from every three-word example and thus utilize all available contextual dependencies when ambiguous or out-of-vocabulary words appear in the test set.
</bodyText>
<subsectionHeader confidence="0.9875965">
2.3 Long-Short Term Memory (LSTM) for
Text Normalization
</subsectionHeader>
<bodyText confidence="0.999951096153846">
An LSTM (Hochreiter and Schmidhuber, 1997) is a variant of recurrent neural networks (RNNs) that is specifically designed for sequence labeling on temporal data. LSTMs extend traditional RNNs with a longer-term memory by introducing a memory block that features one or more self-connected memory cells along with three gating units: an input gate, a forget gate, and an output gate (Graves, 2012). Traditional RNNs often suffer from vanishing and exploding gradient problems when training deep networks with the backpropagation-through-time method, which prevent RNNs from storing long-term dependencies from previous time steps in the sequential data. In LSTMs, the input and output gates modulate the incoming and outgoing signals of the memory cell, and the forget gate controls whether the previous state of the memory cell is remembered or forgotten; this structure allows the model to preserve gradient information over long periods of time, and thus effectively addresses the vanishing/exploding gradients that make training difficult in standard RNNs (Graves, 2012).

We use an LSTM as our base model, as depicted in Figure 2. Note that in the figure, the previous word (Word_{t-1}) and the following word (Word_{t+1}) are omitted due to space limitations; they have the same structure as the current word (Word_t). To predict a word's edit operations, the three words and their associated POS tags are fed into the model as a sequence of characters per word. As noted above, the POS of each word is inserted before the first character of that word, regarded as a heading character that provides extra information for the associated word.

When a deep learning model takes words or characters as input, one approach to obtaining their representation is one-hot encoding: a bit vector whose length is the size of the vocabulary of words or characters, in which only the bit for the associated word/character is on (i.e., 1) while all other bits are off (i.e., 0). Another popular approach is word/character embeddings, whose representations are learned in the context of unsupervised language modeling (Mikolov et al., 2013; Pennington et al., 2014) or supervised tasks of interest (Mesnil et al., 2013). We choose the latter approach and learn character embeddings using a linear projection layer while training the text normalization LSTM model in a supervised manner. We set
<page confidence="0.998486">
114
</page>
<figureCaption confidence="0.999287">
Figure 2: An illustration of the LSTM-based text normalization model
</figureCaption>
<bodyText confidence="0.999620944444444">
both the character and POS embedding sizes to 256 for this task, based on preliminary analyses using a grid search.

For our base code, we utilized a Theano-based (Bastien et al., 2012) LSTM implementation¹ with a single-cell memory block per time step, originally implemented for a sentiment analysis task on an IMDB data set.

In this implementation, the input gate (i_t), forget gate (f_t), and candidate value of the memory content (c̃_t) at time t are computed by Equations (1), (2), and (3), respectively, in which W and U are weight matrices for the input (x_t) at time t and the cell output (h_{t-1}) at time t-1, b is the bias vector of each unit, and σ and tanh are the logistic sigmoid and hyperbolic tangent functions, respectively:
</bodyText>
<equation confidence="0.999252666666667">
i_t = σ(W_i x_t + U_i h_{t-1} + b_i)   (1)
f_t = σ(W_f x_t + U_f h_{t-1} + b_f)   (2)
c̃_t = tanh(W_c x_t + U_c h_{t-1} + b_c)   (3)
</equation>
<bodyText confidence="0.999663727272727">
Once these three vectors are computed, the memory cell's state is updated to a new state (c_t) by modulating the current memory candidate value (c̃_t) via the input gate (i_t) and the previous memory cell state (c_{t-1}) via the forget gate (f_t). Through this process, a memory cell decides whether to keep or forget the previous memory state and regulates the candidate of the current memory state via the input gate. This step is described in Equation (4):
</bodyText>
<note confidence="0.9543755">
¹ http://deeplearning.net/tutorial/lstm.html
</note>
<equation confidence="0.9543755">
c_t = i_t ⊙ c̃_t + f_t ⊙ c_{t-1}   (4)
</equation>
<bodyText confidence="0.999677">
In Equation (5), the output gate (o_t), calculated similarly to Equations (1) and (2), is used to compute the cell activation (h_t) of the LSTM block based on the new memory state (c_t) (Equation 6):
</bodyText>
<equation confidence="0.9998845">
o_t = σ(W_o x_t + U_o h_{t-1} + b_o)   (5)
h_t = o_t ⊙ tanh(c_t)   (6)
</equation>
<bodyText confidence="0.999952916666667">
In this model, a variant of the LSTM proposed by Graves (2012), the input and forget gates do not take the previous memory cell's state as input, and the output gate does not utilize the current memory cell's state, which yields a computational benefit when training models; rather, the current memory cell's state is only used to calculate the cell's output representation, together with the vector computed by the output gate (Equation 6).
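Equations (1) through (6) translate directly into code; below is a NumPy sketch of one time step, with parameter shapes and initialization left to the caller:

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def lstm_step(x_t, h_prev, c_prev, W, U, b):
        # W, U, b are dicts of per-gate weight matrices and bias vectors.
        i_t = sigmoid(W["i"] @ x_t + U["i"] @ h_prev + b["i"])     # (1) input gate
        f_t = sigmoid(W["f"] @ x_t + U["f"] @ h_prev + b["f"])     # (2) forget gate
        c_cand = np.tanh(W["c"] @ x_t + U["c"] @ h_prev + b["c"])  # (3) candidate
        c_t = i_t * c_cand + f_t * c_prev                          # (4) new state
        o_t = sigmoid(W["o"] @ x_t + U["o"] @ h_prev + b["o"])     # (5) output gate
        h_t = o_t * np.tanh(c_t)                                   # (6) cell output
        return h_t, c_t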
As illustrated in Figure 2, a character or POS tag is fed into the model at each time step, inducing a cell output (h) and a cell state (c). To predict the label (i.e., the edit operations of a word), the model performs average pooling (h_avg) over the sequence of computed cell output representations (h_{(t-1),0} through h_{(t+1),z}) for the three-word input sequence, calculates posterior probabilities of all candidate labels from the averaged representation (h_avg) in a softmax layer, and chooses the label with the highest posterior probability as its prediction.
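A sketch of this pooling and prediction step (W_out and b_out stand in for the trained softmax-layer parameters, which are not specified in the paper):

    import numpy as np

    def predict_label(hs, W_out, b_out):
        # hs: cell outputs for every time step of the three-word sequence.
        h_avg = np.mean(hs, axis=0)           # average pooling over time
        logits = W_out @ h_avg + b_out
        probs = np.exp(logits - logits.max())
        probs /= probs.sum()                  # softmax posteriors
        return int(np.argmax(probs)), probs   # most probable edit-op label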
</bodyText>
<page confidence="0.99842">
115
</page>
<table confidence="0.999805">
                Without Dictionary Normalization    With Dictionary Normalization
                Precision   Recall    F1            Precision   Recall    F1
Fold 1          0.8777      0.6735    0.7622        0.8803      0.7185    0.7912
Fold 2          0.9036      0.6546    0.7592        0.9134      0.7232    0.8072
Fold 3          0.8737      0.6352    0.7356        0.8797      0.6805    0.7674
Fold 4          0.8671      0.6501    0.7431        0.9107      0.6986    0.7907
Fold 5          0.8388      0.6867    0.7551        0.8859      0.7347    0.8032
Averaged score  0.8722      0.6600    0.7510        0.8940      0.7111    0.7919
</table>
<tableCaption confidence="0.984016">
Table 1: 5-fold cross-validation results of LSTMs without dictionary normalization and with dictionary normalization.
</tableCaption>
<table confidence="0.999983625">
                Non-contextual Model                Contextual Model
                Precision   Recall    F1            Precision   Recall    F1
Fold 1          0.9032      0.6838    0.7783        0.8803      0.7185    0.7912
Fold 2          0.8776      0.7419    0.8041        0.9134      0.7232    0.8072
Fold 3          0.8988      0.6704    0.7680        0.8797      0.6805    0.7674
Fold 4          0.9209      0.6961    0.7929        0.9107      0.6986    0.7907
Fold 5          0.8589      0.7387    0.7943        0.8859      0.7347    0.8032
Averaged score  0.8919      0.7062    0.7875        0.8940      0.7111    0.7919
</table>
<tableCaption confidence="0.999706">
Table 2: 5-fold cross-validation results of LSTMs: non-contextual model vs. contextual model.
</tableCaption>
<bodyText confidence="0.999837916666667">
For the other parameter settings in this experiment, we used 256 hidden units, a 25% dropout rate (Srivastava et al., 2014), ADADELTA (Zeiler, 2012) for network optimization, negative log-likelihood as the cost function, and mini-batch gradient descent with a batch size of 16. To avoid overfitting, we set aside a separate validation set and let the training process repeat until there has been no progress on the validation set within the last ten iterations.
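A sketch of this early-stopping criterion (train_epoch and validate stand in for the Theano training and evaluation routines, which are not shown in the paper):

    def train_with_early_stopping(train_epoch, validate, patience=10):
        # Stop once `patience` consecutive iterations bring no improvement
        # on the held-out validation set.
        best_score, best_params, since_best = float("-inf"), None, 0
        while since_best != patience:
            params = train_epoch()
            score = validate(params)
            if score > best_score:
                best_score, best_params, since_best = score, params, 0
            else:
                since_best += 1
        return best_params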
</bodyText>
<sectionHeader confidence="0.990358" genericHeader="method">
3 Empirical Evaluation
</sectionHeader>
<bodyText confidence="0.999966205128205">
Before submitting our test set results to the W-NUT English lexical normalization shared task, we ran a 5-fold cross validation on the training set to evaluate the proposed approach. To conduct the experiment, we split the training set into 5 partitions at the tweet level and trained an LSTM model in each fold, iteratively using 4 of the 5 partitions.

In the first evaluation, we examine two variations of our approach to measure the impact of dictionary-based normalization as an intermediate step: (1) applying phases 1 and 3, in which we do not leverage dictionary-based normalization but predict labels with an LSTM model after at-mention, hash-tag, and URL filtering, and (2) applying all three phases. The evaluation is conducted on contextual models that take three-word inputs.

Table 1 presents the results of these two approaches for each fold. For a pairwise comparison of the two approaches, we conduct a Wilcoxon signed-rank test on the F1 rates. The result indicates a statistically significant improvement in F1 (79.19%, a 5.4% relative improvement) for "with dictionary normalization" over "without dictionary normalization" (Z = -2.023, p = 0.043). To examine the effect of the LSTM-based model, we further evaluated a without-LSTM approach (phases 1 and 2 only), in which all out-of-vocabulary words are left unchanged and the most frequently observed canonical form in the dictionary is used as the label for ambiguous words (if the frequencies are tied, the first form in the hash table is used). The average F1 score of this dictionary-only model is 0.7786;
</bodyText>
<page confidence="0.997724">
116
</page>
<bodyText confidence="0.999892933333333">
the LSTM model with the dictionary statistically outperforms the dictionary-only model (Z = -2.023, p = 0.043).
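The first comparison can be reproduced approximately from the per-fold F1 scores in Table 1 (note that SciPy's exact small-sample method may not return the paper's normal-approximation values):

    from scipy.stats import wilcoxon

    # Per-fold F1 scores from Table 1 (without vs. with dictionary normalization).
    without_dict = [0.7622, 0.7592, 0.7356, 0.7431, 0.7551]
    with_dict = [0.7912, 0.8072, 0.7674, 0.7907, 0.8032]
    # The paper reports Z = -2.023, p = 0.043 for these five pairs.
    print(wilcoxon(without_dict, with_dict))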
In the second evaluation, we additionally compare another pair of variations: the contextual model (taking the surrounding words as well as the current word) vs. the non-contextual model (taking only the current word). Table 2 summarizes the comparison of the two approaches, both enriched with dictionary normalization. The contextual model outperforms the non-contextual model in terms of F1 score, but the difference is not statistically significant (Z = -1.214, p = 0.225).
To construct a final model for test set prediction, we utilize an ensemble of the contextual LSTM models with dictionary normalization. Given a test set, we calculate the prediction probability from each of the 5 models induced in the five-fold cross validation, multiply the probability values from the softmax layers, and choose the label with the highest resulting probability. In the evaluation through the W-NUT competition, this approach (NCSU_SAS_WOOKHEE.cm) achieved a precision of 91.36%, recall of 73.98%, and F1 score of 81.75%, placing second in the constrained text normalization track.
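A sketch of the ensemble rule (the posterior vectors below are invented for illustration):

    import numpy as np

    def ensemble_predict(per_model_probs):
        # per_model_probs: one softmax posterior vector per cross-validation
        # model; multiply them and pick the highest-product label.
        combined = np.prod(np.asarray(per_model_probs), axis=0)
        return int(np.argmax(combined))

    # Hypothetical posteriors over three candidate labels from five models:
    folds = [[0.6, 0.3, 0.1], [0.5, 0.4, 0.1], [0.7, 0.2, 0.1],
             [0.4, 0.4, 0.2], [0.3, 0.5, 0.2]]
    print(ensemble_predict(folds))  # -> 0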
</bodyText>
<sectionHeader confidence="0.995606" genericHeader="conclusions">
4 Conclusion and Future Work
</sectionHeader>
<bodyText confidence="0.999990647058824">
Text normalization is a key capability for addressing the challenges posed by noisy text. This paper presents a contextual long-short term memory based normalization method, augmented with a dictionary-based normalization technique. Evaluations on the training set indicate that the model with dictionary-based normalization significantly outperforms the model without it. The method was evaluated on the English Tweet test set offered by the W-NUT shared task, and shows promise as a lexical normalizer for noisy texts, achieving an F1 score of 81.75%. We conclude that inputs encoded as a sequence of characters are a natural fit for the LSTM's temporal structure when normalizing non-standard language.

In the future, it will be important to investigate whether including more surrounding words as context contributes to the model's performance, and to examine the possibility of using different types of word-level meta-data as additional heading characters in the model. Another direction for future work is to investigate adaptations of the LSTM model with a self-generated dictionary. For example, when a word is ambiguous, the LSTM's prediction is not necessarily among the normalization candidates given by the dictionary for that word. Tightly coupling the LSTM model with the candidate list, or building a separate model targeted only at ambiguous words, may significantly increase performance.
</bodyText>
<sectionHeader confidence="0.998934" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.9981016875">
Timothy Baldwin, Marie Catherine de Marneffe, Bo Han, Young-Bum Kim, Alan Ritter, and Wei Xu. 2015. Shared Tasks of the 2015 Workshop on Noisy User-generated Text: Twitter Lexical Normalization and Named Entity Recognition. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015), Beijing, China.

Frederic Bastien, Pascal Lamblin, Razvan Pascanu, James Bergstra, Ian Goodfellow, Arnaud Bergeron, Nicolas Bouchard, David Warde-Farley, and Yoshua Bengio. 2012. Theano: new features and speed improvements. arXiv preprint arXiv:1211.5590.

Grzegorz Chrupała. 2014. Normalizing tweets with edit scripts and recurrent neural embeddings. In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics, pages 680–686. Association for Computational Linguistics.

Kevin Gimpel, Nathan Schneider, Brendan O'Connor, Dipanjan Das, Daniel Mills, Jacob Eisenstein, Michael Heilman, Dani Yogatama, Jeffrey Flanigan, and Noah A. Smith. 2011. Part-of-speech tagging for Twitter: Annotation, features, and experiments. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: short papers - Volume 2, pages 42–47. Association for Computational Linguistics.
</reference>
<page confidence="0.99206">
117
</page>
<reference confidence="0.994859378640777">
Stephan Gouws, Dirk Hovy, and Donald Metzler. 2011. Unsupervised mining of lexical variants from noisy text. In Proceedings of the First Workshop on Unsupervised Learning in NLP, pages 82–90. Association for Computational Linguistics.

Alex Graves. 2012. Supervised sequence labelling with recurrent neural networks. Heidelberg: Springer.

Bo Han, Paul Cook, and Timothy Baldwin. 2012. Automatically constructing a normalisation dictionary for microblogs. In Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning, pages 421–432. Association for Computational Linguistics.

Geoffrey Hinton, Li Deng, Dong Yu, George E. Dahl, Abdel-rahman Mohamed, Navdeep Jaitly, Andrew Senior, et al. 2012. Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups. IEEE Signal Processing Magazine, 29(6): 82–97.

Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long short-term memory. Neural Computation, 9(8): 1735–1780.

V. I. Levenshtein. 1966. Binary codes capable of correcting deletions, insertions, and reversals. Soviet Physics Doklady, 10(8): 707–710.

Fei Liu, Fuliang Weng, Bingqing Wang, and Yang Liu. 2011. Insertion, deletion, or substitution? Normalizing text messages without pre-categorization nor supervision. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: short papers - Volume 2, pages 71–76. Association for Computational Linguistics.

James Martens and Ilya Sutskever. 2011. Learning recurrent neural networks with Hessian-free optimization. In Proceedings of the 28th International Conference on Machine Learning (ICML-11), pages 1033–1040.

Gregoire Mesnil, Xiaodong He, Li Deng, and Yoshua Bengio. 2013. Investigation of recurrent-neural-network architectures and learning methods for spoken language understanding. In Proceedings of the 14th Annual Conference of the International Speech Communication Association, pages 3771–3775.

Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S. Corrado, and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. In Advances in Neural Information Processing Systems, pages 3111–3119.

Olutobi Owoputi, Brendan O'Connor, Chris Dyer, Kevin Gimpel, Nathan Schneider, and Noah A. Smith. 2013. Improved part-of-speech tagging for online conversational text with word clusters. In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 380–390.

Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global vectors for word representation. In Proceedings of Empirical Methods in Natural Language Processing (EMNLP 2014).

Alan Ritter, Sam Clark, and Oren Etzioni. 2011. Named entity recognition in tweets: An experimental study. In Proceedings of the Conference on Empirical Methods in Natural Language Processing, pages 1524–1534. Association for Computational Linguistics.

Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher D. Manning, Andrew Y. Ng, and Christopher Potts. 2013. Recursive deep models for semantic compositionality over a sentiment treebank. In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing, pages 1631–1642, Stroudsburg, PA, October. Association for Computational Linguistics.

Nitish Srivastava, Geoffrey Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdinov. 2014. Dropout: A simple way to prevent neural networks from overfitting. The Journal of Machine Learning Research, 15(1): 1929–1958.

Oscar Täckström, Dipanjan Das, Slav Petrov, Ryan McDonald, and Joakim Nivre. 2013. Token and type constraints for cross-lingual part-of-speech tagging. Transactions of the Association for Computational Linguistics, 1: 1–12.

Pascal Vincent, Hugo Larochelle, Isabelle Lajoie, Yoshua Bengio, and Pierre-Antoine Manzagol. 2010. Stacked denoising autoencoders: Learning useful representations in a deep network with a local denoising criterion.
<page confidence="0.985047">
118
</page>
<reference confidence="0.989816444444444">
The Journal of Machine Learning Research, 11: 3371–3408.

David Yarowsky. 1997. Homograph disambiguation in text-to-speech synthesis. In Progress in Speech Synthesis, pages 157–172. Springer, New York.

Matthew D. Zeiler. 2012. ADADELTA: An adaptive learning rate method. arXiv preprint arXiv:1212.5701.
</reference>
<page confidence="0.998861">
119
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.248158">
<title confidence="0.850869">NCSU_SAS_WOOKHEE: A Deep Contextual Long-Short Term Memory Model for Text Normalization</title>
<author confidence="0.504803">Wookhee Min Bradford W. Mott</author>
<affiliation confidence="0.755735">Center for Educational Informatics North Carolina State University</affiliation>
<address confidence="0.974971">Raleigh, NC, USA</address>
<email confidence="0.999837">wmin@ncsu.edu</email>
<email confidence="0.999837">bwmott@ncsu.edu</email>
<abstract confidence="0.994069476190476">To address the challenges of normalizing online conversational texts prevalent in social media, we propose a contextual long-short term memory (LSTM) recurrent neural network based approach, augmented with a self-generated dictionary normalization technique. Our approach utilizes a sequence of characters as well as the part-of-speech associated with words, without harnessing any external lexical resources. This work is evaluated on the English Tweet data set provided by the ACL 2015 W-NUT Normalization of Noisy Text shared task. The results, achieving second place (F1 score: 81.75%) in the constrained track of the competition, indicate that the proposed LSTM-based approach is a promising tool for normalizing non-standard language.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Timothy Baldwin</author>
<author>Marie Catherine de Marneffe</author>
<author>Bo Han</author>
<author>Young-Bum Kim</author>
<author>Alan Ritter</author>
<author>Wei Xu</author>
</authors>
<title>Shared Tasks of the 2015 Workshop on Noisy User-generated Text: Twitter Lexical Normalization and Named Entity Recognition.</title>
<date>2015</date>
<booktitle>In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015),</booktitle>
<location>Beijing, China.</location>
<marker>Baldwin, de Marneffe, Han, Kim, Ritter, Xu, 2015</marker>
<rawString>Timothy Baldwin, Marie Catherine de Marneffe, Bo Han, Young-Bum Kim, Alan Ritter, and Wei Xu. 2015. Shared Tasks of the 2015 Workshop on Noisy User-generated Text: Twitter Lexical Normalization and Named Entity Recognition. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015), Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Frederic Bastien</author>
<author>Pascal Lamblin</author>
<author>Razvan Pascanu</author>
<author>James Bergstra</author>
<author>Ian Goodfellow</author>
<author>Arnaud Bergeron</author>
<author>Nicolas Bouchard</author>
<author>David Warde-Farley</author>
<author>Yoshua Bengio</author>
</authors>
<title>Theano: new features and speed improvements. arXiv preprint arXiv:1211.5590.</title>
<date>2012</date>
<contexts>
<context position="17497" citStr="Bastien et al., 2012" startWordPosition="2800" endWordPosition="2803">, where their representations are learned in the context of unsupervised language modeling (Mikolov et al., 2013; Pennington et al., 2014) or supervised tasks of interest (Mesnil et al., 2013). We choose the latter approach and learn character embeddings using a linear projection layer while training the text normalization LSTM model in a supervised manner. We set 114 Figure 2: An illustration of the LSTM-based text normalization model both the character and POS embedding size to 256 for this task based on preliminary analyses using a grid search. For our base code, we utilized a Theanobased (Bastien et al., 2012) LSTM implementation1 with a single-cell memory block per time, which was implemented targeting a sentiment analysis task on an IMDB data set. In this implementation, the input gate (𝑖t), forget gate (𝑓t), and candidate value of the memory content (𝑐t) at time t are computed by Equation (1), (2), and (3), respectively, in which W and U are weight matrices for the input (xt) at time t and the cell output (ht-1) at time t-1, b is the bias vector of each unit, and σ and tanh are the logistic sigmoid and hyperbolic tangent function, respectively: 𝑖t = σ(𝑊i𝑥t + 𝑈iℎt-1 + 𝑏i) 𝑓t = σ(𝑊f𝑥t + 𝑈fℎt-1 + 𝑏</context>
</contexts>
<marker>Bastien, Lamblin, Pascanu, Bergstra, Goodfellow, Bergeron, Bouchard, Warde-Farley, Bengio, 2012</marker>
<rawString>Frederic Bastien, Pascal Lamblin, Razvan Pascanu, James Bergstra, Ian Goodfellow, Arnaud Bergeron, Nicolas Bouchard, David Warde-Farley, and Yoshua Bengio. 2012. Theano: new features and speed improvements. arXiv preprint arXiv:1211.5590.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Grzegorz Chrupała</author>
</authors>
<title>Normalizing tweets with edit scripts and recurrent neural embeddings.</title>
<date>2014</date>
<booktitle>In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics,</booktitle>
<pages>680--686</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<contexts>
<context position="1515" citStr="Chrupała, 2014" startWordPosition="221" endWordPosition="222">M-based approach is a promising tool for normalizing non-standard language. 1 Introduction Recent years have seen increasing use of online social media such as Twitter and Facebook that has generated a growing body of text where nonstandard language is prevalent. These nonstandard lexical items take many different forms, including unintentional errors based on users’ cognitive misconceptions and typographical errors, and intentional non-canonical language such as abbreviations, word lengthening by duplication of characters, Internet slang, phonetic substitutions, and creative use of language (Chrupała, 2014; Owoputi et al., 2013). A key challenge posed by these non-standard texts is the negative impact on traditional natural language processing (NLP) pipeline processes, evidenced by noticeable underperformance of their predictive accuracy in various domains such as part-of-speech tagging (Gimpel et al., 2011) and named entity recognition (Ritter et al., 2011) compared to more standard text. As an approach to addressing this challenge, text normalization techniques have been widely investigated, ranging from extracting domain specific lexical variants (Gouws et al., 2011), unified letter transfor</context>
<context position="4089" citStr="Chrupała, 2014" startWordPosition="600" endWordPosition="601">d in multi-layer neural network training. In this work, we leverage long-short term memory models (LSTMs) (Hochreiter and Schmidhuber, 1997; Graves, 2012), a variant of recurrent neural networks, to conduct text normalization on the data set given from the WNUT English lexical normalization shared task (Baldwin et al., 2015). We additionally harness the part-of-speech tagger created by Noah’s Ark research team (Owoputi et al., 2013), a free resource to the constrained task. Similar to Chrupała’s work that predicts Levenshtein edit operations between canonical and non-canonical forms of words (Chrupała, 2014), this proposed approach predicts word-level edit operations based on character-level inputs. The proposed approach is novel compared to previous work from four perspectives: (1) it utilizes LSTMs to predict the word-level edit operations, along with a dictionary induced from the training set, (2) it takes as input the surrounding words as well as the current word to capture contextual information of the predicted word, while any additional contextual information (e.g., part-of-speech tags) is treated as heading characters of the word, (3) character and part-of-speech embeddings are learned on</context>
<context position="10746" citStr="Chrupała, 2014" startWordPosition="1694" endWordPosition="1695">ng this technique with POS tags in the context of text normalization. The input encoding for predicting edit operations of the current word (Wordn) is described in Figure 1. To summarize, the number of inputs in a sequence is 3 + x + y + z, where x, y and z are the number of characters of the previous, current, and following word, respectively, while 3 is derived from the POS tags of all three words. Encoding the output is based on the Levenshtein distance algorithm (Levenshtein, 1966) that supports three operations: insert, replace, and delete, inspired by Chrupała’s text normalization work (Chrupała, 2014). In this work, we reformulate his approach to predict word-level edit operations instead of character-level edit operations, by which the model predicts a label for an individual token. In the character-level prediction, to correctly normalize a token, it requires all correction predictions on every character that belongs to a word (i.e., probabilities get multiplied), whereas the word-level prediction requires one prediction per token. Once a training sample is given, the Levenshtein distance algorithm calculates the required edit operations to convert the possibly non-standard word into the</context>
</contexts>
<marker>Chrupała, 2014</marker>
<rawString>Grzegorz Chrupała. Normalizing tweets with edit scripts and recurrent neural embeddings. 2014. In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics, pages 680–686. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="false">
<authors>
<author>Kevin Gimpel</author>
<author>Nathan Schneider</author>
<author>Brendan O&apos;Connor</author>
<author>Dipanjan Das</author>
<author>Daniel Mills</author>
<author>Jacob Eisenstein</author>
<author>Michael Heilman</author>
<author>Dani Yogatama</author>
<author>Jeffrey Flanigan</author>
<author>Noah A Smith</author>
</authors>
<title>Part-of-speech tagging for Twitter: Annotation, features, and experiments.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association</booktitle>
<pages>42--47</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<contexts>
<context position="1823" citStr="Gimpel et al., 2011" startWordPosition="262" endWordPosition="265">any different forms, including unintentional errors based on users’ cognitive misconceptions and typographical errors, and intentional non-canonical language such as abbreviations, word lengthening by duplication of characters, Internet slang, phonetic substitutions, and creative use of language (Chrupała, 2014; Owoputi et al., 2013). A key challenge posed by these non-standard texts is the negative impact on traditional natural language processing (NLP) pipeline processes, evidenced by noticeable underperformance of their predictive accuracy in various domains such as part-of-speech tagging (Gimpel et al., 2011) and named entity recognition (Ritter et al., 2011) compared to more standard text. As an approach to addressing this challenge, text normalization techniques have been widely investigated, ranging from extracting domain specific lexical variants (Gouws et al., 2011), unified letter transformation (Liu et al., 2011), dictionary based methods using string substitution (Han et al., 2012) in an unsupervised manner, to character-level edit operation predictions utilizing conditional random fields in a supervised manner (Chrupała, 2014). Because language data consists of sequential information, suc</context>
</contexts>
<marker>Gimpel, Schneider, O&apos;Connor, Das, Mills, Eisenstein, Heilman, Yogatama, Flanigan, Smith, 2011</marker>
<rawString>Kevin Gimpel, Nathan Schneider, Brendan O&apos;Connor, Dipanjan Das, Daniel Mills, Jacob Eisenstein, Michael Heilman, Dani Yogatama, Jeffrey Flanigan, and Noah A. Smith. 2011. Part-of-speech tagging for Twitter: Annotation, features, and experiments. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: short papers-Volume 2, pages 42–47. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Stephan Gouws</author>
<author>Dirk Hovy</author>
<author>Donald Metzler</author>
</authors>
<title>Unsupervised mining of lexical variants from noisy text.</title>
<date>2011</date>
<booktitle>In Proceedings of the First workshop on Unsupervised Learning in NLP,</booktitle>
<pages>82--90</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<contexts>
<context position="2090" citStr="Gouws et al., 2011" startWordPosition="302" endWordPosition="305">and creative use of language (Chrupała, 2014; Owoputi et al., 2013). A key challenge posed by these non-standard texts is the negative impact on traditional natural language processing (NLP) pipeline processes, evidenced by noticeable underperformance of their predictive accuracy in various domains such as part-of-speech tagging (Gimpel et al., 2011) and named entity recognition (Ritter et al., 2011) compared to more standard text. As an approach to addressing this challenge, text normalization techniques have been widely investigated, ranging from extracting domain specific lexical variants (Gouws et al., 2011), unified letter transformation (Liu et al., 2011), dictionary based methods using string substitution (Han et al., 2012) in an unsupervised manner, to character-level edit operation predictions utilizing conditional random fields in a supervised manner (Chrupała, 2014). Because language data consists of sequential information, such as streams of characters and sequences of words, many NLP approaches leverage computational models that can effectively deal with temporal data, such as hidden Markov models and conditional random fields (TŠckstršm, 2013; Chrupała, 2014). More recently, deep learni</context>
</contexts>
<marker>Gouws, Hovy, Metzler, 2011</marker>
<rawString>Stephan Gouws, Dirk Hovy, and Donald Metzler. 2011. Unsupervised mining of lexical variants from noisy text. In Proceedings of the First workshop on Unsupervised Learning in NLP, pages 82–90. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alex Graves</author>
</authors>
<title>Supervised sequence labelling with recurrent neural networks.</title>
<date>2012</date>
<publisher>Springer.</publisher>
<location>Heidelberg:</location>
<contexts>
<context position="3628" citStr="Graves, 2012" startWordPosition="527" endWordPosition="528">e with the emergence of effective training methods for deep networks, such as pretraining (Vincent et al., 2010) and optimization techniques (Zeiler, 2010; Martens and Sutskever, 2011) that significantly diminish problems associated with vanishing and exploding gradient that 111 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 111–119, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics are often observed in multi-layer neural network training. In this work, we leverage long-short term memory models (LSTMs) (Hochreiter and Schmidhuber, 1997; Graves, 2012), a variant of recurrent neural networks, to conduct text normalization on the data set given from the WNUT English lexical normalization shared task (Baldwin et al., 2015). We additionally harness the part-of-speech tagger created by Noah’s Ark research team (Owoputi et al., 2013), a free resource to the constrained task. Similar to Chrupała’s work that predicts Levenshtein edit operations between canonical and non-canonical forms of words (Chrupała, 2014), this proposed approach predicts word-level edit operations based on character-level inputs. The proposed approach is novel compared to pr</context>
<context position="15211" citStr="Graves, 2012" startWordPosition="2425" endWordPosition="2426">very three-word example and thus utilize all available contextual dependencies when ambiguous or out-ofvocabulary words appear in the test set. 2.3 Long-Short Term Memory (LSTM) for Text Normalization An LSTM (Hochreiter and Schmidhuber, 1997) is a variant of recurrent neural networks (RNNs) that is specifically designed for sequence labeling on temporal data. LSTM has been extended to have a longer term memory compared to traditional RNNs by introducing a memory block that features one or more self-connected memory cells along with three gating units: input gate, forget gate and output gate (Graves, 2012). Traditional RNNs often suffer from vanishing and exploding gradient problems when training deep networks using the backpropagation-throughtime method, and thus prevent RNNs from storing long-term dependencies from previous time steps in the sequential data. In LSTMs, the input and output gate modulate the incoming and outgoing signals on the memory cell, and the forget gate controls the previous state of the memory cell whether to remember or forget; this structure allows it to preserve gradient information over long periods of time, and thus effectively address vanishing/exploding gradients</context>
<context position="18978" citStr="Graves (2012)" startWordPosition="3070" endWordPosition="3071"> gate (𝑓t). Through this process, a memory cell decides whether to keep or forget the previous memory state and regulates the candidate of the current memory state via the input gate. This step is described in Equation (4): 1 http://deeplearning.net/tutorial/lstm.html 𝑐t = 𝑖t𝑐t + 𝑓t𝑐t-1 (4) In Equation (5), the output gate (𝑜t), similarly calculated as in Equation (1) and (2), is utilized to compute the cell activation (ℎt) of the LSTM block, based on the new memory state (𝑐t) (Equation 6): 𝑜f = σ(𝑊o𝑥t + 𝑈0ℎt-1 + 𝑏o) (5) ℎt = 𝑜f tanh(𝑐t) (6) In this model, as a variant of the LSTM proposed by Graves (2012), the input and forget gates do not take as input the previous memory cell’s state, and the output gate does not utilize the current memory cell’s state, to take advantage of a computational benefit when training models; rather, the current memory cell’s state is only utilized to calculate the cell’s output representation, along with the computed vector from the output gate (Equation 6). As illustrated in Figure 2, a character or POS is fed into the model at each time step, inducing a cell output (h) and a cell state (c). To predict the label (i.e., edit operations of a word), the model perfor</context>
</contexts>
<marker>Graves, 2012</marker>
<rawString>Alex Graves. 2012. Supervised sequence labelling with recurrent neural networks. Heidelberg: Springer.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Paul Cook</author>
<author>Timothy Baldwin</author>
</authors>
<title>Automatically constructing a normalisation dictionary for microblogs.</title>
<date>2012</date>
<booktitle>In Proceedings of the 2012 joint conference on empirical methods in natural language processing and computational natural language learning,</booktitle>
<pages>421--432</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<contexts>
<context position="2211" citStr="Han et al., 2012" startWordPosition="321" endWordPosition="324">he negative impact on traditional natural language processing (NLP) pipeline processes, evidenced by noticeable underperformance of their predictive accuracy in various domains such as part-of-speech tagging (Gimpel et al., 2011) and named entity recognition (Ritter et al., 2011) compared to more standard text. As an approach to addressing this challenge, text normalization techniques have been widely investigated, ranging from extracting domain specific lexical variants (Gouws et al., 2011), unified letter transformation (Liu et al., 2011), dictionary based methods using string substitution (Han et al., 2012) in an unsupervised manner, to character-level edit operation predictions utilizing conditional random fields in a supervised manner (Chrupała, 2014). Because language data consists of sequential information, such as streams of characters and sequences of words, many NLP approaches leverage computational models that can effectively deal with temporal data, such as hidden Markov models and conditional random fields (TŠckstršm, 2013; Chrupała, 2014). More recently, deep learning models (e.g., multi-layer feedforward neural networks, recurrent neural networks, recursive neural networks) have been</context>
</contexts>
<marker>Han, Cook, Baldwin, 2012</marker>
<rawString>Bo Han, Paul Cook, and Timothy Baldwin. 2012. Automatically constructing a normalisation dictionary for microblogs. In Proceedings of the 2012 joint conference on empirical methods in natural language processing and computational natural language learning, pages 421–432. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Geoffrey Hinton</author>
<author>Li Deng</author>
<author>Dong Yu</author>
<author>George E Dahl</author>
<author>Abdel-rahman Mohamed</author>
<author>Navdeep Jaitly</author>
<author>Andrew Senior</author>
</authors>
<title>Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups. Signal Processing Magazine,</title>
<date>2012</date>
<journal>IEEE,</journal>
<volume>29</volume>
<issue>6</issue>
<pages>82--97</pages>
<contexts>
<context position="2921" citStr="Hinton et al., 2012" startWordPosition="423" endWordPosition="427">ional random fields in a supervised manner (Chrupała, 2014). Because language data consists of sequential information, such as streams of characters and sequences of words, many NLP approaches leverage computational models that can effectively deal with temporal data, such as hidden Markov models and conditional random fields (TŠckstršm, 2013; Chrupała, 2014). More recently, deep learning models (e.g., multi-layer feedforward neural networks, recurrent neural networks, recursive neural networks) have been used in NLP to achieve state-of-the-art performance in areas such as speech recognition (Hinton et al., 2012) and sentiment analysis (Socher et al., 2013). The success of deep learning has been attainable with the emergence of effective training methods for deep networks, such as pretraining (Vincent et al., 2010) and optimization techniques (Zeiler, 2010; Martens and Sutskever, 2011) that significantly diminish problems associated with vanishing and exploding gradient that 111 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 111–119, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics are often observed in multi-layer neural network training. In th</context>
</contexts>
<marker>Hinton, Deng, Yu, Dahl, Mohamed, Jaitly, Senior, 2012</marker>
<rawString>Geoffrey Hinton, Li Deng, Dong Yu, George E. Dahl, Abdel-rahman Mohamed, Navdeep Jaitly, Andrew Senior et al. 2012. Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups. Signal Processing Magazine, IEEE, 29(6): 82–97.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Sepp Hochreiter</author>
<author>Jürgen Schmidhuber</author>
</authors>
<title>Long short-term memory.</title>
<date>1997</date>
<journal>Neural computation,</journal>
<volume>9</volume>
<issue>8</issue>
<pages>1735--1780</pages>
<contexts>
<context position="3613" citStr="Hochreiter and Schmidhuber, 1997" startWordPosition="523" endWordPosition="526">f deep learning has been attainable with the emergence of effective training methods for deep networks, such as pretraining (Vincent et al., 2010) and optimization techniques (Zeiler, 2010; Martens and Sutskever, 2011) that significantly diminish problems associated with vanishing and exploding gradient that 111 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 111–119, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics are often observed in multi-layer neural network training. In this work, we leverage long-short term memory models (LSTMs) (Hochreiter and Schmidhuber, 1997; Graves, 2012), a variant of recurrent neural networks, to conduct text normalization on the data set given from the WNUT English lexical normalization shared task (Baldwin et al., 2015). We additionally harness the part-of-speech tagger created by Noah’s Ark research team (Owoputi et al., 2013), a free resource to the constrained task. Similar to Chrupała’s work that predicts Levenshtein edit operations between canonical and non-canonical forms of words (Chrupała, 2014), this proposed approach predicts word-level edit operations based on character-level inputs. The proposed approach is novel</context>
<context position="14841" citStr="Hochreiter and Schmidhuber, 1997" startWordPosition="2362" endWordPosition="2365">edit operations of nothing). As a result, the total number of labels obtained form the training set is reduced from 706 to 694. Training examples for LSTMs are built upon all words except for hash-tags, at-mentions, and URLs that are filtered in the first step, regardless of whether a word is ambiguous or unique. In this manner, we expect that LSTMs can capture context information from every three-word example and thus utilize all available contextual dependencies when ambiguous or out-ofvocabulary words appear in the test set. 2.3 Long-Short Term Memory (LSTM) for Text Normalization An LSTM (Hochreiter and Schmidhuber, 1997) is a variant of recurrent neural networks (RNNs) that is specifically designed for sequence labeling on temporal data. LSTM has been extended to have a longer term memory compared to traditional RNNs by introducing a memory block that features one or more self-connected memory cells along with three gating units: input gate, forget gate and output gate (Graves, 2012). Traditional RNNs often suffer from vanishing and exploding gradient problems when training deep networks using the backpropagation-throughtime method, and thus prevent RNNs from storing long-term dependencies from previous time </context>
</contexts>
<marker>Hochreiter, Schmidhuber, 1997</marker>
<rawString>Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long short-term memory. Neural computation, 9(8): 1735–1780.</rawString>
</citation>
<citation valid="true">
<authors>
<author>V I Levenshtein</author>
</authors>
<title>Binary codes capable of correcting deletions, insertions, and reversals.</title>
<date>1966</date>
<journal>Soviet physics doklady,</journal>
<volume>10</volume>
<issue>8</issue>
<pages>707--710</pages>
<contexts>
<context position="10621" citStr="Levenshtein, 1966" startWordPosition="1677" endWordPosition="1678">te that leveraging POSs about words is extendable to utilize any other metainformation, and we examine the feasibility of applying this technique with POS tags in the context of text normalization. The input encoding for predicting edit operations of the current word (Wordn) is described in Figure 1. To summarize, the number of inputs in a sequence is 3 + x + y + z, where x, y and z are the number of characters of the previous, current, and following word, respectively, while 3 is derived from the POS tags of all three words. Encoding the output is based on the Levenshtein distance algorithm (Levenshtein, 1966) that supports three operations: insert, replace, and delete, inspired by Chrupała’s text normalization work (Chrupała, 2014). In this work, we reformulate his approach to predict word-level edit operations instead of character-level edit operations, by which the model predicts a label for an individual token. In the character-level prediction, to correctly normalize a token, it requires all correction predictions on every character that belongs to a word (i.e., probabilities get multiplied), whereas the word-level prediction requires one prediction per token. Once a training sample is given, </context>
</contexts>
<marker>Levenshtein, 1966</marker>
<rawString>V. I. Levenshtein. 1966. Binary codes capable of correcting deletions, insertions, and reversals. Soviet physics doklady, 10(8): 707–710.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Fei Liu</author>
<author>Fuliang Weng</author>
<author>Bingqing Wang</author>
<author>Yang Liu</author>
</authors>
<title>Insertion, deletion, or substitution?: normalizing text messages without pre-categorization nor supervision.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: short papers-Volume 2,</booktitle>
<pages>71--76</pages>
<publisher>Association for Computational Linguistics.</publisher>
<contexts>
<context position="2140" citStr="Liu et al., 2011" startWordPosition="310" endWordPosition="313">i et al., 2013). A key challenge posed by these non-standard texts is the negative impact on traditional natural language processing (NLP) pipeline processes, evidenced by noticeable underperformance of their predictive accuracy in various domains such as part-of-speech tagging (Gimpel et al., 2011) and named entity recognition (Ritter et al., 2011) compared to more standard text. As an approach to addressing this challenge, text normalization techniques have been widely investigated, ranging from extracting domain specific lexical variants (Gouws et al., 2011), unified letter transformation (Liu et al., 2011), dictionary based methods using string substitution (Han et al., 2012) in an unsupervised manner, to character-level edit operation predictions utilizing conditional random fields in a supervised manner (Chrupała, 2014). Because language data consists of sequential information, such as streams of characters and sequences of words, many NLP approaches leverage computational models that can effectively deal with temporal data, such as hidden Markov models and conditional random fields (TŠckstršm, 2013; Chrupała, 2014). More recently, deep learning models (e.g., multi-layer feedforward neural ne</context>
</contexts>
<marker>Liu, Weng, Wang, Liu, 2011</marker>
<rawString>Fei Liu, Fuliang Weng, Bingqing Wang, and Yang Liu. 2011. Insertion, deletion, or substitution?: normalizing text messages without pre-categorization nor supervision. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: short papers-Volume 2, pages 71–76. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>James Martens</author>
<author>Ilya Sutskever</author>
</authors>
<title>Learning recurrent neural networks with hessianfree optimization.</title>
<date>2011</date>
<booktitle>In Proceedings of the 28th International Conference on Machine Learning (ICML-11),</booktitle>
<pages>1033--1040</pages>
<contexts>
<context position="3199" citStr="Martens and Sutskever, 2011" startWordPosition="466" endWordPosition="469">uch as hidden Markov models and conditional random fields (TŠckstršm, 2013; Chrupała, 2014). More recently, deep learning models (e.g., multi-layer feedforward neural networks, recurrent neural networks, recursive neural networks) have been used in NLP to achieve state-of-the-art performance in areas such as speech recognition (Hinton et al., 2012) and sentiment analysis (Socher et al., 2013). The success of deep learning has been attainable with the emergence of effective training methods for deep networks, such as pretraining (Vincent et al., 2010) and optimization techniques (Zeiler, 2010; Martens and Sutskever, 2011) that significantly diminish problems associated with vanishing and exploding gradient that 111 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 111–119, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics are often observed in multi-layer neural network training. In this work, we leverage long-short term memory models (LSTMs) (Hochreiter and Schmidhuber, 1997; Graves, 2012), a variant of recurrent neural networks, to conduct text normalization on the data set given from the WNUT English lexical normalization shared task (Baldwin et al., 2015</context>
</contexts>
<marker>Martens, Sutskever, 2011</marker>
<rawString>James Martens and Ilya Sutskever. 2011. Learning recurrent neural networks with hessianfree optimization. In Proceedings of the 28th International Conference on Machine Learning (ICML-11), pages 1033–1040.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Gregoire Mesnil</author>
<author>Xiaodong He</author>
<author>Li Deng</author>
<author>Yoshua Bengio</author>
</authors>
<title>Investigation of recurrent-neural-network architectures and learning methods for spoken language understanding.</title>
<date>2013</date>
<booktitle>In Proceedings of the 14th Annual Conference of the International Speech Communication Association,</booktitle>
<pages>3771--3775</pages>
<contexts>
<context position="17068" citStr="Mesnil et al., 2013" startWordPosition="2728" endWordPosition="2731">a information for the associated word. When a deep learning model takes words or characters as input, an approach to obtaining their representation is using one-hot-encoding, which is a bit vector whose length is the size of the vocabulary of words or characters, where only the associated word/character bit is on (i.e., 1) while all other bits are off (i.e., 0). Another popular approach is utilizing word/character embeddings, where their representations are learned in the context of unsupervised language modeling (Mikolov et al., 2013; Pennington et al., 2014) or supervised tasks of interest (Mesnil et al., 2013). We choose the latter approach and learn character embeddings using a linear projection layer while training the text normalization LSTM model in a supervised manner. We set 114 Figure 2: An illustration of the LSTM-based text normalization model both the character and POS embedding size to 256 for this task based on preliminary analyses using a grid search. For our base code, we utilized a Theanobased (Bastien et al., 2012) LSTM implementation1 with a single-cell memory block per time, which was implemented targeting a sentiment analysis task on an IMDB data set. In this implementation, the </context>
</contexts>
<marker>Mesnil, He, Deng, Bengio, 2013</marker>
<rawString>Gregoire Mesnil, Xiaodong He, Li Deng, and Yoshua Bengio. 2013. Investigation of recurrent-neural-network architectures and learning methods for spoken language understanding. In Proceedings of the 14th Annual Conference of the International Speech Communication Association, pages 3771–3775.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Tomas Mikolov</author>
<author>Ilya Sutskever</author>
<author>Kai Chen</author>
<author>Greg S Corrado</author>
<author>Jeff Dean</author>
</authors>
<title>Distributed representations of words and phrases and their compositionality.</title>
<date>2013</date>
<booktitle>In Advances in Neural Information Processing Systems,</booktitle>
<pages>3111--3119</pages>
<contexts>
<context position="16988" citStr="Mikolov et al., 2013" startWordPosition="2715" endWordPosition="2718">first character of that word, regarded as a heading character that provides extra information for the associated word. When a deep learning model takes words or characters as input, an approach to obtaining their representation is using one-hot-encoding, which is a bit vector whose length is the size of the vocabulary of words or characters, where only the associated word/character bit is on (i.e., 1) while all other bits are off (i.e., 0). Another popular approach is utilizing word/character embeddings, where their representations are learned in the context of unsupervised language modeling (Mikolov et al., 2013; Pennington et al., 2014) or supervised tasks of interest (Mesnil et al., 2013). We choose the latter approach and learn character embeddings using a linear projection layer while training the text normalization LSTM model in a supervised manner. We set 114 Figure 2: An illustration of the LSTM-based text normalization model both the character and POS embedding size to 256 for this task based on preliminary analyses using a grid search. For our base code, we utilized a Theanobased (Bastien et al., 2012) LSTM implementation1 with a single-cell memory block per time, which was implemented targe</context>
</contexts>
<marker>Mikolov, Sutskever, Chen, Corrado, Dean, 2013</marker>
<rawString>Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S. Corrado, and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. In Advances in Neural Information Processing Systems, pages 3111– 3119.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Olutobi Owoputi</author>
<author>Brendan O&apos;Connor</author>
<author>Chris Dyer</author>
<author>Kevin Gimpel</author>
<author>Nathan Schneider</author>
<author>Noah A Smith</author>
</authors>
<title>Improved Part-of-Speech Tagging for Online Conversational Text with Word Clusters.</title>
<date>2013</date>
<booktitle>In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies,</booktitle>
<pages>380--390</pages>
<contexts>
<context position="1538" citStr="Owoputi et al., 2013" startWordPosition="223" endWordPosition="226"> is a promising tool for normalizing non-standard language. 1 Introduction Recent years have seen increasing use of online social media such as Twitter and Facebook that has generated a growing body of text where nonstandard language is prevalent. These nonstandard lexical items take many different forms, including unintentional errors based on users’ cognitive misconceptions and typographical errors, and intentional non-canonical language such as abbreviations, word lengthening by duplication of characters, Internet slang, phonetic substitutions, and creative use of language (Chrupała, 2014; Owoputi et al., 2013). A key challenge posed by these non-standard texts is the negative impact on traditional natural language processing (NLP) pipeline processes, evidenced by noticeable underperformance of their predictive accuracy in various domains such as part-of-speech tagging (Gimpel et al., 2011) and named entity recognition (Ritter et al., 2011) compared to more standard text. As an approach to addressing this challenge, text normalization techniques have been widely investigated, ranging from extracting domain specific lexical variants (Gouws et al., 2011), unified letter transformation (Liu et al., 201</context>
<context position="3910" citStr="Owoputi et al., 2013" startWordPosition="570" endWordPosition="573">t 111 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 111–119, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics are often observed in multi-layer neural network training. In this work, we leverage long-short term memory models (LSTMs) (Hochreiter and Schmidhuber, 1997; Graves, 2012), a variant of recurrent neural networks, to conduct text normalization on the data set given from the WNUT English lexical normalization shared task (Baldwin et al., 2015). We additionally harness the part-of-speech tagger created by Noah’s Ark research team (Owoputi et al., 2013), a free resource to the constrained task. Similar to Chrupała’s work that predicts Levenshtein edit operations between canonical and non-canonical forms of words (Chrupała, 2014), this proposed approach predicts word-level edit operations based on character-level inputs. The proposed approach is novel compared to previous work from four perspectives: (1) it utilizes LSTMs to predict the word-level edit operations, along with a dictionary induced from the training set, (2) it takes as input the surrounding words as well as the current word to capture contextual information of the predicted wor</context>
<context position="9774" citStr="Owoputi et al., 2013" startWordPosition="1525" endWordPosition="1528">n a tweet), a padding character is assigned for the previous or following word to have a consistent format. In this work, we additionally consider a word’s part-of-speech (POS) as extra input to the model, as previous literature (e.g., Yarowsky, 1997; TŠckstršm, 2013) indicates POS tags can improve performance in other natural language processing tasks, such as textto-speech synthesis and NLP parsers. We use an off-the-shelf POS tagger that features Brown clustering: the CMU Twitter Part-of-Speech Tagger, which achieves a state-of-the-art tagging result of 93% on a Twitter benchmark data set (Owoputi et al., 2013). The extracted POS information is added as a distinct heading character to each word, so that they are leveraged in the LSTM models. Similar to the character padding, we apply a POS padding for missing previous or next words. Note that leveraging POSs about words is extendable to utilize any other metainformation, and we examine the feasibility of applying this technique with POS tags in the context of text normalization. The input encoding for predicting edit operations of the current word (Wordn) is described in Figure 1. To summarize, the number of inputs in a sequence is 3 + x + y + z, wh</context>
</contexts>
<marker>Owoputi, O&apos;Connor, Dyer, Gimpel, Schneider, Smith, 2013</marker>
<rawString>Olutobi Owoputi, Brendan O&apos;Connor, Chris Dyer, Kevin Gimpel, Nathan Schneider, and Noah A. Smith. 2013. Improved Part-of-Speech Tagging for Online Conversational Text with Word Clusters. In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 380–390.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jeffrey Pennington</author>
<author>Richard Socher</author>
<author>Christopher D Manning</author>
</authors>
<title>Glove: Global vectors for word representation.</title>
<date>2014</date>
<booktitle>In Proceedings of the Empirical Methods in Natural Language Processing (EMNLP 2014).</booktitle>
<contexts>
<context position="17014" citStr="Pennington et al., 2014" startWordPosition="2719" endWordPosition="2722">t word, regarded as a heading character that provides extra information for the associated word. When a deep learning model takes words or characters as input, an approach to obtaining their representation is using one-hot-encoding, which is a bit vector whose length is the size of the vocabulary of words or characters, where only the associated word/character bit is on (i.e., 1) while all other bits are off (i.e., 0). Another popular approach is utilizing word/character embeddings, where their representations are learned in the context of unsupervised language modeling (Mikolov et al., 2013; Pennington et al., 2014) or supervised tasks of interest (Mesnil et al., 2013). We choose the latter approach and learn character embeddings using a linear projection layer while training the text normalization LSTM model in a supervised manner. We set 114 Figure 2: An illustration of the LSTM-based text normalization model both the character and POS embedding size to 256 for this task based on preliminary analyses using a grid search. For our base code, we utilized a Theanobased (Bastien et al., 2012) LSTM implementation1 with a single-cell memory block per time, which was implemented targeting a sentiment analysis </context>
</contexts>
<marker>Pennington, Socher, Manning, 2014</marker>
<rawString>Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. Glove: Global vectors for word representation. In Proceedings of the Empirical Methods in Natural Language Processing (EMNLP 2014).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Sam Clark</author>
<author>Oren Etzioni</author>
</authors>
<title>Named entity recognition in tweets: an experimental study.</title>
<date>2011</date>
<booktitle>In Proceedings of the Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>1524--1534</pages>
<institution>Association for Computational Linguistics.</institution>
<contexts>
<context position="1874" citStr="Ritter et al., 2011" startWordPosition="270" endWordPosition="273"> based on users’ cognitive misconceptions and typographical errors, and intentional non-canonical language such as abbreviations, word lengthening by duplication of characters, Internet slang, phonetic substitutions, and creative use of language (Chrupała, 2014; Owoputi et al., 2013). A key challenge posed by these non-standard texts is the negative impact on traditional natural language processing (NLP) pipeline processes, evidenced by noticeable underperformance of their predictive accuracy in various domains such as part-of-speech tagging (Gimpel et al., 2011) and named entity recognition (Ritter et al., 2011) compared to more standard text. As an approach to addressing this challenge, text normalization techniques have been widely investigated, ranging from extracting domain specific lexical variants (Gouws et al., 2011), unified letter transformation (Liu et al., 2011), dictionary based methods using string substitution (Han et al., 2012) in an unsupervised manner, to character-level edit operation predictions utilizing conditional random fields in a supervised manner (Chrupała, 2014). Because language data consists of sequential information, such as streams of characters and sequences of words, </context>
</contexts>
<marker>Ritter, Clark, Etzioni, 2011</marker>
<rawString>Alan Ritter, Sam Clark, and Oren Etzioni. 2011. Named entity recognition in tweets: an experimental study. In Proceedings of the Conference on Empirical Methods in Natural Language Processing, pages 1524–1534. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Richard Socher</author>
<author>Alex Perelygin</author>
<author>Jean Wu</author>
<author>Jason Chuang</author>
<author>Christopher D Manning</author>
<author>Andrew Y Ng</author>
<author>Christopher Potts</author>
</authors>
<title>Recursive deep models for semantic compositionality over a sentiment treebank.</title>
<date>2013</date>
<booktitle>In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>1631--1642</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA,</location>
<contexts>
<context position="2966" citStr="Socher et al., 2013" startWordPosition="431" endWordPosition="434">hrupała, 2014). Because language data consists of sequential information, such as streams of characters and sequences of words, many NLP approaches leverage computational models that can effectively deal with temporal data, such as hidden Markov models and conditional random fields (TŠckstršm, 2013; Chrupała, 2014). More recently, deep learning models (e.g., multi-layer feedforward neural networks, recurrent neural networks, recursive neural networks) have been used in NLP to achieve state-of-the-art performance in areas such as speech recognition (Hinton et al., 2012) and sentiment analysis (Socher et al., 2013). The success of deep learning has been attainable with the emergence of effective training methods for deep networks, such as pretraining (Vincent et al., 2010) and optimization techniques (Zeiler, 2010; Martens and Sutskever, 2011) that significantly diminish problems associated with vanishing and exploding gradient that 111 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 111–119, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics are often observed in multi-layer neural network training. In this work, we leverage long-short term memory m</context>
</contexts>
<marker>Socher, Perelygin, Wu, Chuang, Manning, Ng, Potts, 2013</marker>
<rawString>Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher D. Manning, Andrew Y. Ng, and Christopher Potts. 2013. Recursive deep models for semantic compositionality over a sentiment treebank. In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing, pages 1631– 1642, Stroudsburg, PA, October. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Nitish Srivastava</author>
<author>Geoffrey Hinton</author>
<author>Alex Krizhevsky</author>
<author>Ilya Sutskever</author>
<author>Ruslan Salakhutdinov</author>
</authors>
<title>Dropout: A simple way to prevent neural networks from overfitting.</title>
<date>2014</date>
<journal>The Journal of Machine Learning Research,</journal>
<volume>15</volume>
<issue>1</issue>
<pages>1929--1958</pages>
<contexts>
<context position="21062" citStr="Srivastava et al., 2014" startWordPosition="3395" endWordPosition="3398">d with dictionary normalization. Precision Non-contextual Model F1 Precision Contextual Model F1 Recall Recall Fold 1 0.9032 0.6838 0.7783 0.8803 0.7185 0.7912 Fold 2 0.8776 0.7419 0.8041 0.9134 0.7232 0.8072 Fold 3 0.8988 0.6704 0.7680 0.8797 0.6805 0.7674 Fold 4 0.9209 0.6961 0.7929 0.9107 0.6986 0.7907 Fold 5 0.8589 0.7387 0.7943 0.8859 0.7347 0.8032 Averaged score 0.8919 0.7062 0.7875 0.8940 0.7111 0.7919 Table 2: 5-fold cross validation results of LSTMs: non-contextual model vs. contextual model. For other parameter settings in this experiment, we used 256 hidden units, 25% dropout rate (Srivastava et al., 2014), ADADELTA (Zeiler, 2012) for the network optimization, negative log-likelihood for the cost function, and mini-batch based gradient descent with the batch size set to 16. To avoid overfitting, we set aside a separate validation set, and let the training process repeat until there is no progress within the last ten iterations in terms of performance on the validation set. 3 Empirical Evaluation Before submitting our test set result to the W-NUT English lexical normalization shared task, we ran a 5-fold cross validation on the training set to evaluate the proposed approach. To conduct the exper</context>
</contexts>
<marker>Srivastava, Hinton, Krizhevsky, Sutskever, Salakhutdinov, 2014</marker>
<rawString>Nitish Srivastava, Geoffrey Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdinov. 2014. Dropout: A simple way to prevent neural networks from overfitting. The Journal of Machine Learning Research, 15 (1): 1929–1958.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Oscar Täckström</author>
<author>Dipanjan Das</author>
<author>Slav Petrov</author>
<author>Ryan McDonald</author>
<author>Joakim Nivre</author>
</authors>
<title>Token and type constraints for cross-lingual part-of-speech tagging.</title>
<date>2013</date>
<journal>Transactions of the Association for Computational Linguistics,</journal>
<volume>1</volume>
<pages>1--12</pages>
<marker>Täckström, Das, Petrov, McDonald, Nivre, 2013</marker>
<rawString>Oscar Täckström, Dipanjan Das, Slav Petrov, Ryan McDonald, and Joakim Nivre. 2013. Token and type constraints for cross-lingual part-of-speech tagging. Transactions of the Association for Computational Linguistics, 1: 1–12.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Pascal Vincent</author>
<author>Hugo Larochelle</author>
<author>Isabelle Lajoie</author>
<author>Yoshua Bengio</author>
<author>Pierre-Antoine Manzagol</author>
</authors>
<title>Stacked denoising autoencoders: Learning useful representations in a deep network with a local denoising criterion.</title>
<date>2010</date>
<journal>The Journal of Machine Learning Research,</journal>
<volume>11</volume>
<pages>3371--3408</pages>
<contexts>
<context position="3127" citStr="Vincent et al., 2010" startWordPosition="457" endWordPosition="460">putational models that can effectively deal with temporal data, such as hidden Markov models and conditional random fields (TŠckstršm, 2013; Chrupała, 2014). More recently, deep learning models (e.g., multi-layer feedforward neural networks, recurrent neural networks, recursive neural networks) have been used in NLP to achieve state-of-the-art performance in areas such as speech recognition (Hinton et al., 2012) and sentiment analysis (Socher et al., 2013). The success of deep learning has been attainable with the emergence of effective training methods for deep networks, such as pretraining (Vincent et al., 2010) and optimization techniques (Zeiler, 2010; Martens and Sutskever, 2011) that significantly diminish problems associated with vanishing and exploding gradient that 111 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 111–119, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics are often observed in multi-layer neural network training. In this work, we leverage long-short term memory models (LSTMs) (Hochreiter and Schmidhuber, 1997; Graves, 2012), a variant of recurrent neural networks, to conduct text normalization on the data set given from </context>
</contexts>
<marker>Vincent, Larochelle, Lajoie, Bengio, Manzagol, 2010</marker>
<rawString>Pascal Vincent, Hugo Larochelle, Isabelle Lajoie, Yoshua Bengio, and Pierre-Antoine Manzagol. 2010. Stacked denoising autoencoders: Learning useful representations in a deep network with a local denoising criterion. The Journal of Machine Learning Research, 11: 3371–3408.</rawString>
</citation>
<citation valid="false">
<journal>The Journal of Machine Learning Research,</journal>
<volume>11</volume>
<pages>3371--3408</pages>
<marker></marker>
<rawString>The Journal of Machine Learning Research, 11: 3371–3408.</rawString>
</citation>
<citation valid="true">
<authors>
<author>David Yarowsky</author>
</authors>
<title>Homograph disambiguation in text-to-speech synthesis.</title>
<date>1997</date>
<booktitle>In Progress in speech synthesis,</booktitle>
<pages>157--172</pages>
<publisher>Springer,</publisher>
<location>New York.</location>
<contexts>
<context position="9403" citStr="Yarowsky, 1997" startWordPosition="1471" endWordPosition="1472">t of sequential, lowercased characters that compose the previous word, current word, and following word. Each character is mapped to a unique index (0–66), since there are a total of 67 different characters in the training data after the preprocessing step described in 2.1. If the current word does not have a previous or next word (e.g., the first or last word in a tweet), a padding character is assigned for the previous or following word to have a consistent format. In this work, we additionally consider a word’s part-of-speech (POS) as extra input to the model, as previous literature (e.g., Yarowsky, 1997; TŠckstršm, 2013) indicates POS tags can improve performance in other natural language processing tasks, such as textto-speech synthesis and NLP parsers. We use an off-the-shelf POS tagger that features Brown clustering: the CMU Twitter Part-of-Speech Tagger, which achieves a state-of-the-art tagging result of 93% on a Twitter benchmark data set (Owoputi et al., 2013). The extracted POS information is added as a distinct heading character to each word, so that they are leveraged in the LSTM models. Similar to the character padding, we apply a POS padding for missing previous or next words. No</context>
</contexts>
<marker>Yarowsky, 1997</marker>
<rawString>David Yarowsky. 1997. Homograph disambiguation in text-to-speech synthesis. In Progress in speech synthesis, pages 157–172. Springer, New York.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Matthew D Zeiler</author>
</authors>
<title>ADADELTA: an adaptive learning rate method.</title>
<date>2012</date>
<journal>arXiv preprint arXiv:1212.5701.</journal>
<contexts>
<context position="21087" citStr="Zeiler, 2012" startWordPosition="3400" endWordPosition="3401">cision Non-contextual Model F1 Precision Contextual Model F1 Recall Recall Fold 1 0.9032 0.6838 0.7783 0.8803 0.7185 0.7912 Fold 2 0.8776 0.7419 0.8041 0.9134 0.7232 0.8072 Fold 3 0.8988 0.6704 0.7680 0.8797 0.6805 0.7674 Fold 4 0.9209 0.6961 0.7929 0.9107 0.6986 0.7907 Fold 5 0.8589 0.7387 0.7943 0.8859 0.7347 0.8032 Averaged score 0.8919 0.7062 0.7875 0.8940 0.7111 0.7919 Table 2: 5-fold cross validation results of LSTMs: non-contextual model vs. contextual model. For other parameter settings in this experiment, we used 256 hidden units, 25% dropout rate (Srivastava et al., 2014), ADADELTA (Zeiler, 2012) for the network optimization, negative log-likelihood for the cost function, and mini-batch based gradient descent with the batch size set to 16. To avoid overfitting, we set aside a separate validation set, and let the training process repeat until there is no progress within the last ten iterations in terms of performance on the validation set. 3 Empirical Evaluation Before submitting our test set result to the W-NUT English lexical normalization shared task, we ran a 5-fold cross validation on the training set to evaluate the proposed approach. To conduct the experiment, we split the train</context>
</contexts>
<marker>Zeiler, 2012</marker>
<rawString>Matthew D. Zeiler. 2012. ADADELTA: an adaptive learning rate method. arXiv preprint arXiv:1212.5701</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>