<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.000919">
<title confidence="0.99461">
Toward Tweets Normalization Using Maximum Entropy
</title>
<author confidence="0.972779">
Mohammad Arshi Saloot
</author>
<affiliation confidence="0.99629">
Department of Artificial
Intelligence, University of
</affiliation>
<address confidence="0.798889">
Malaya, 50603, Malaysia
</address>
<email confidence="0.999005">
phd_siamak@yahoo.com
</email>
<author confidence="0.981299">
Norisma Idris
</author>
<affiliation confidence="0.9974515">
Department of Artificial
Intelligence, University of
</affiliation>
<address confidence="0.798845">
Malaya, 50603, Malaysia
</address>
<email confidence="0.998786">
norisma@um.edu.my
</email>
<author confidence="0.99024">
Liyana Shuib
</author>
<affiliation confidence="0.999293">
Department of Information
System, University of Malaya,
</affiliation>
<address confidence="0.716287">
50603, Malaysia
</address>
<email confidence="0.998489">
liyanashuib@um.edu.my
</email>
<author confidence="0.998075">
Ram Gopal Raj
</author>
<affiliation confidence="0.999361">
Department of Artificial
Intelligence, University of
</affiliation>
<address confidence="0.600308">
Malaya, 50603, Malaysia
</address>
<email confidence="0.982109">
ramdr@um.edu.my
</email>
<note confidence="0.396852">
*Corresponding author
</note>
<sectionHeader confidence="0.982001" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999953121212121">
The use of social network services and
microblogs, such as Twitter, has created
valuable text resources, which contain
extremely noisy text. Twitter messages
contain so much noise that it is difficult
to use them in natural language pro-
cessing tasks. This paper presents a new
approach using the maximum entropy
model for normalizing Tweets. The pro-
posed approach addresses words that are
unseen in the training phase. Although
the maximum entropy needs a training
dataset to adjust its parameters, the pro-
posed approach can normalize unseen da-
ta in the training set. The principle of
maximum entropy emphasizes incorpo-
rating the available features into a uni-
form model. First, we generate a set of
normalized candidates for each out-of-
vocabulary word based on lexical, pho-
nemic, and morphophonemic similarities.
Then, three different probability scores
are calculated for each candidate using
positional indexing, a dependency-based
frequency feature and a language model.
After the optimal values of the model pa-
rameters are obtained in a training phase,
the model can calculate the final proba-
bility value for candidates. The approach
achieved an 83.12 BLEU score in testing
using 2,000 Tweets. Our experimental re-
sults show that the maximum entropy ap-
proach significantly outperforms previ-
</bodyText>
<author confidence="0.864595">
AiTi Aw*
</author>
<affiliation confidence="0.7419835">
Institute for Infocomm Research (I2R),
A*STAR, Singapore
</affiliation>
<email confidence="0.891568">
aaiti@i2r.a-star.edu.sg
</email>
<bodyText confidence="0.95947">
ous well-known normalization approach-
es.
</bodyText>
<sectionHeader confidence="0.999238" genericHeader="keywords">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999943870967742">
The advent of Web 2.0 and electronic communi-
cations has enabled the extensive creation and
dissemination of user-generated content (UGC).
The UGC collections provide invaluable data
sources in order to mine and extract beneficial
information and knowledge, while, at the same
time, resulting in less standardized language
(Clark &amp; Araki, 2011; Daugherty, Eastin, &amp;
Bright, 2008).
However, such content diverges from standard
writing conventions. As shown by experts
(Bieswanger, 2007; Thurlow &amp; Brown, 2003),
this divergence is due to the usage of a variety of
coding strategies, including digit phonemes (you
too — you2), phonetic transcriptions (you — u),
vowel drops (dinner — dnnr), misspellings (con-
venience — convineince), and missing or incor-
rect punctuation marks (If I were you, I&apos;d proba-
bly go. — If I were you Id probably go). These
alterations are due to three main parameters: 1)
The small allowance of characters, 2) the con-
straints of the small keypads, and 3) using UGC
in informal communications between friends and
relatives.
Whatever their causes, these alterations con-
siderably affect any standard natural language
processing (NLP) system, due to the presence of
many out of vocabulary (OOV) words, also
known as non-standard words (NSWs) and un-
known words. Therefore, a text normalization
process must be performed before any conven-
</bodyText>
<page confidence="0.992382">
19
</page>
<note confidence="0.988437">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 19–27,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.999927263157895">
tional NLP process is implemented (Sproat et al.,
2001). As defined by Liu, Weng, Wang, and Liu
(2011), “Text message normalization aims to
replace the non-standard tokens that carry signif-
icant meanings with the context-appropriate
standard words.”
This paper proposes a novel normalization ap-
proach for Twitter messages. Twitter is the most
popular microblogging service in the world for
news-casting, sharing thoughts, and staying in
touch with friends. Since its initial founding in
2006, it has gathered hundreds of millions of reg-
istered users. Tweets refer to messages sent on
Twitter, which is restricted to 140 characters, 20
characters less than the 160 allowed by SMS.
Because of this limitation, users have to tran-
scribe Tweets with as much brevity as possible.
The normalization bears a resemblance to
spelling correction. The ultimate goal of which is
the detection and correction of OOV words. The
spelling correction methods only focus on mis-
spelled words while normalization systems con-
sider all forms of OOV words, such as represent-
ing sounds phonetically (e.g. by the way — btw)
and shortened forms (e.g. university — uni).
Thus, normalization approaches should address a
higher volume of OOV words compared to
spelling correction approaches that lead to more
complexity.
To address this complexity, we use maximum
entropy (Berger, Pietra, &amp; Pietra, 1996; Och &amp;
Ney, 2002) for utilizing and incorporating more
probability functions. Our approach is based on
the hypothesis that integrating more probability
functions will boost the performance of the
method; however, the available information and
number of probability functions for (OOV word,
standard word) pairs are always limited. Maxi-
mum entropy (Maxent) provides a criterion for
integrating probability distributions based on
partial knowledge. The Maxent produces the
lowest biased estimation on the given infor-
mation, that is, it is maximally neutral regarding
missing information. When defining some un-
known events with a statistical model, we should
always select the one that has maximum entropy.
Although the Maxent has already been used in
the normalization sphere (e.g. Pennell and Liu
(2010) utilized Maxent to classify deletion-based
abbreviations), this paper explains how to em-
ploy Maxent for selecting the best-normalized
candidate.
We have developed a method that does not re-
quire annotated training data and it normalizes
unseen data. Most of the normalization ap-
proaches substantially depend on the manually
annotated data, while the labeled data is costly
and time consuming to prepare. We generate
normalized candidates for each detected OOV
based on lexical, phonemic, and morphophone-
mic variations. In addition, since our target da-
taset encompasses Twitter messages from Singa-
poreans and code-switching between Malay and
English is frequent in the dataset, a Malay-
English dictionary is utilized to generate candi-
dates for Malay words. Finally, maximum entro-
py presents a backbone to combine several con-
ditional probabilities of normalized candidates.
The remainder of this paper is organized as
follows: Section 2 gives a survey of different
approaches of normalizing noisy text. Section 3
describes the preprocessing stage. Section 4 il-
lustrates the candidate generation stage. The pro-
posed candidate selection method is demonstrat-
ed in Section 5. Finally, Section 6 concludes this
paper with a summary and future works.
</bodyText>
<sectionHeader confidence="0.999592" genericHeader="introduction">
2 Related work
</sectionHeader>
<bodyText confidence="0.999438838709678">
The normalization approaches can be categorized
into four groups. The first group is called statisti-
cal machine translation (SMT) paradigm that
addresses the normalization problem as a statisti-
cal machine translation task. This paradigm was
first introduced by Aw, Zhang, Xiao and Su
(2006) to normalize SMS text that translates a
source language (UGC) to a target language
(standard language). This paradigm has since
been re-examined, expanded and improved by
other researchers (Lopez Ludeña, San Segundo,
Montero, Barra Chicote, &amp; Lorenzo, 2012). For
example, Kaufmann and Kalita (2010) used the
SMT-like approach to normalize English Tweets.
To normalize SMS language, a supervised
noisy channel model was introduced by
Choudhury, Saraf, Jain, Sarkar, and Basu (2007)
that used a hidden Markov model (HMM). This
approach mimics the spell checking task that
tries to handle the normalization problem via
noisy channel models that study the UGC text as
a noisy version of standard language. This para-
digm has been scrutinized and enhanced by other
researchers (Liu et al., 2011; Xue, Yin, &amp;
Davison, 2011a). For example, Cook and
Stevenson (2009) modified this approach to de-
sign an unsupervised method using probabilistic
models for only three common abbreviation
types: stylistic variation, prefix clipping, and
subsequence abbreviation. In addition, Beaufort,
Roekhaut, Cougnon, and Fairon (2010) merged
</bodyText>
<page confidence="0.990213">
20
</page>
<bodyText confidence="0.999915063829787">
the SMT-like and the spell checking approaches
to normalize French SMSs.
The third group is the dictionary based nor-
malization approach, which is an easy-to-use and
fast solution. This approach requires a dictionary
whose entries are OOV and standard form pairs.
It has been proven that using a colloquial dic-
tionary can outperform some state-of-the-art and
complex approaches (Clark &amp; Araki, 2011;
Saloot, Idris, &amp; Mahmud, 2014). However, its
performance highly relies on the size of the dic-
tionary. Therefore, Han, Cook, and Baldwin
(2012) introduced a method to automatically
compile a large dictionary. To address the short-
comings of the dictionary approach, Oliva,
Serrano, Del Castillo, and Iglesias (2013) intro-
duced a special Spanish phonetic dictionary, in
which each entry is formed by a coded consonant
string, vowels strings, and their positions in the
word, for normalizing Spanish SMS texts.
The fourth group resembles automatic speech
recognition (ASR) systems. This paradigm con-
sists of three steps: 1) converting the text to
strings of phonemes via letter-to-phone rules, 2)
converting the strings of phonemes to words via
pronunciation dictionaries, and 3) choosing the
most probable words. The ASR-like approach
has been merged with other approaches to boost
its performance. Kobus, Yvon, and Damnati
(2008) combined ASR-like and SMT-like ap-
proaches to normalize French SMSs. Lin,
Bilmes, Vergyri, and Kirchhoff (2007) used this
approach to detect OOV words in switchboard
data.
Han and Baldwin (2011) illustrated a lexical
method for normalizing Twitter messages. After
detecting OOVs, ill-formed words, and generat-
ing a set of candidates, the best candidate is se-
lected using a variety of metrics: lexical edit dis-
tance, phonemic edit distance, longest common
subsequence (LCS), affix substring, language
model, and dependency-based frequency fea-
tures. The method achieved a 93.4 BLEU score
in normalizing 549 English Tweets. This inspired
us to design a normalization method that has
three major stages: preprocessing, candidate gen-
eration, and candidate selection.
</bodyText>
<sectionHeader confidence="0.991832" genericHeader="method">
3 Preprocessing
</sectionHeader>
<bodyText confidence="0.999989606060606">
First, we perform some initial text refining on the
tweets. For example, consecutive whitespace
characters are trimmed to single whitespace, and
extra whitespaces are removed from the begin-
ning and end of Tweets. The initial stage of most
NLP tasks is the tokenization. Existing tokeniza-
tion methods can perform accurately when the
text is thoroughly clean, such as news feeds and
book datasets. For example, the PTB-Tokenizer
is a fast, deterministic, and efficient tokenization
method. On the other hand, UGC text demands
special methods due to irregularities in its
whitespaces and punctuation. As suggested by
Lopez Ludeña et al. (2012), we employ a
straightforward word separating method, which
performs tokenization based on whitespace char-
acters.
One of the most important primary steps in
unsupervised normalization systems is to detect
OOV words. Hunspell and GNU Aspell are two
well-known spell checker systems, however,
Aspell performance is more accurate on the
noisy text (Clark &amp; Araki, 2011). The Aspell
dictionary is utilized to distinguish between
OOV and standard English words. In addition,
we used seven regular expression rules, which
were introduced by Saloot, Idris, and Aw (2014).
This helps to detect proper nouns, email and
URL addresses, Twitter special symbols, and
digits. The potential errors in the OOV word de-
tection step would not affect the performance of
the normalization system since the detected OOV
word will be included in the candidate set.
</bodyText>
<sectionHeader confidence="0.978274" genericHeader="method">
4 Candidate generation
</sectionHeader>
<bodyText confidence="0.999984833333334">
For each given OOV word, a set of normalized
candidates is generated via four different mod-
ules. The first module executes a lexical candi-
date generation, which is extensively utilized in
spell checker systems. It calculates candidates
within a distance of T edit operations of the de-
tected OOV words. Han and Baldwin (2011)
stated that when T is less than or equal to two,
the level of recall is high enough. The edit dis-
tance is the number of applied edits in changing
one word to another. An edit could be a deletion,
transposition, alteration, or insertion. Studies in
spelling correction found that one lexical edit
distance covers 80% to 95% of errors, and two
lexical edit distances cover 98% of them. There-
fore, here we use lexical variations with less than
or equal to two edit distances.
For a word of length n characters, 54n + 25
combinations will be generated with one lexical
edit distance using four reshaping strategies: 1)
Deletion strategy eliminates characters in all pos-
sible positions (e.g. aer — er, ar, ae), which
generates n combinations. 2) Transposition strat-
egy switches two adjacent characters (e.g. aer —
</bodyText>
<page confidence="0.994676">
21
</page>
<bodyText confidence="0.999787611111111">
ear, are), which generates n − 1 combinations.
3) Alteration strategy substitutes each character
with all English alphabets (e.g. aer — ber, cer,
der, eer, fer, ger, her, etc.), which generates 26n
combinations. 4) Insertion strategy presumes that
a letter is dropped, thus adding all the alphabets
between characters (e.g. aer — aaer, baer, caer,
daer, eaer, faer, gaer, haer, etc.), which gener-
ates 26(n + 1) combinations. Finally, from the
achieved combinations, standard words will be
selected using the Aspell dictionary. However,
many OOV words in Twitter are quite far from
their target in terms of edit distance, especially in
terms of deletions and substitutions. Therefore,
we generated more candidates via three other
methods.
Similar to the speech recognition systems, the
second module generates candidates based on
phoneme sounds. First, grapheme to phoneme
conversion is performed using the Phonetisaurus
tool (Novak, Yang, Minematsu, &amp; Hirose, 2011).
Phonetisaurus is an open-source phonetizer that
is designed in the form of a weighted finite state
transducer (WFST). After selecting the 10 best
phoneme sequences, it looks up the phonemes in
a pronouncing dictionary – Carnegie Mellon
University (CMU) dictionary. The CMU is a ma-
chine-readable pronunciation dictionary that con-
tains over 134,000 words including OOV words
such as proper nouns and acronyms. Due to the
existence of a large number of OOV words in the
CMU dictionary, we filter out the OOVs using
the Aspell dictionary.
The third module, as proposed by Saloot,
Idris, and Aw (2014), is a combination of the two
previous modules. First, it lexically generates
candidates within one edit distance of the given
OOV word, and then sends the candidates to the
phoneme module. Since our testing dataset con-
sists of English Tweets posted by Singaporeans,
code-switching between Malay and English is
frequent in the text. Therefore, our last module
translates OOV words to English (if any). We
searched for the tokens in the Smith Malay-
English Dictionary (Smith &amp; Padi, 2006), and
inserted the meanings in the candidate set.
Table 1 displays the average number of gener-
ated candidates for each module. The lowest rate
is associated with the Malay dictionary module.
Two lexical edit operations generate the highest
number of candidates, which indicates the high-
est recall and lowest precision. The rank of com-
bination and phoneme modules are second and
third, respectively.
</bodyText>
<table confidence="0.999457142857143">
No. module Average number of
candidates
Two lexical edit 70
distance
Combination 50
Phoneme 20
Malay dictionary 3
</table>
<tableCaption confidence="0.996106">
Table 1: The average number of generated can-
didates for five letter words.
</tableCaption>
<sectionHeader confidence="0.937309" genericHeader="method">
5 Candidate selection
</sectionHeader>
<bodyText confidence="0.999678157894737">
The main contribution of this work is to present a
novel candidate selection method. The candidate
selection stage consists of two steps: 1) assigning
a variety of probability scores to candidates, and
2) integrating probability scores to select the best
candidate. Our candidate selection method re-
quires a training dataset. The training and testing
datasets are collected from an extensive English
Twitter corpus posted by Singaporeans (Saloot,
Idris, Aw, &amp; Thorleuchter, 2014). Three linguis-
tic experts manually normalized 7,000 Tweets,
while using inter-normalization agreement as an
indicator. The experts were instructed to produce
a text that is as close to standard English as pos-
sible, but leaves the Twitter special symbols (e.g.
#topic and @username) as is. The dataset was
split into two parts: 5,000 messages for the train-
ing phase, and 2,000 messages for the testing
phase.
</bodyText>
<subsectionHeader confidence="0.996868">
5.1 Calculation of probability scores
</subsectionHeader>
<bodyText confidence="0.999902">
In order to select the most suitable candidates,
we calculate their conditional probability scores
using, positional indexing, a dependency-based
frequency feature, and a language model (LM).
Inspired by work on a normalization diction-
ary (Han et al., 2012), the first method to calcu-
late the probability score of the candidates is the
positional indexing, which is widely used in in-
formation retrieval systems. The positional in-
dexing deals with positional locations of term
occurrences inside documents. To compile a po-
sitional index dataset, a method illustrated in
Manning and Raghavan (2009) is applied on a
cleansed portion of our Twitter corpus. Table 2
refers to an example of our achieved positional
index dataset. Each Twitter message is consid-
ered as a single document, and, hence, a unique
document ID is assigned to each document. The
frequency value indicates the total number of
appearances of a word in a document. The posi-
tion values express the locations of the word in
the document.
</bodyText>
<page confidence="0.983879">
22
</page>
<table confidence="0.9998825">
Vocab Document ID. Frequency Position
have 1 2 4,9
4 3 5, 11, 18
are 5 1 2
12 2 2, 9
14 2 2, 11
</table>
<tableCaption confidence="0.913803">
Table 2: An example of the positional indexes
obtained.
</tableCaption>
<bodyText confidence="0.99997393877551">
A probability score is assigned to the normalized
candidate according to a comparison between the
position of the candidate and positional indexes
in the dataset. We look for the candidate in the
dataset where there is an occurrence of the can-
didate with its position index. After aggregating
the number of occurrences, we normalize it be-
tween 0.0 and 1.0.
The next probability calculation method is the
dependency-based frequency, which is an aug-
mentation of the previous method. Inspired by a
work on the lexical normalization of Tweets
(Han &amp; Baldwin, 2011), the noisy portion of our
training dataset is parsed to obtain a dependency
bank using our adapted version of the Stanford
dependency parser (Marneffe, MacCartney, &amp;
Manning, 2006). Since our aim is not to perform
actual dependency parsing, the dependency types
are not extracted. A cleansed corpus is not uti-
lized because the percentage of IV words is high
enough in the corpus, and in the probability-
measuring phase, OOV words are already detect-
ed. For example, from a sentence such as “I will
go to London by next week,” (next, go +3) is ob-
tained, indicating that next appears two words
after go. The aggregations of all the dependency
scores, which are called confidence scores, are
stored in the dependency bank. A five-gram de-
pendency bank is prepared without using a root
node (head-word), that is, the process is iterated
for all words in the sentence.
A probability score between 0.0 and 1.0 is as-
signed to each candidate. A relative position
score in the form of (candidate word, context
word, position) is calculated for each candidate
within a context window of two words on either
side. The obtained relative position of a candi-
date is compared with the existing confidence
score in the dependency bank.
The third method of probability measurement
calculates the probabilities based on a language
model. The cleansed part of our training dataset,
which consists of more than 55,000 words, is fed
into SRILM (Stolcke, 2002) to compile a bidirec-
tional trigram LM by employing the Kneser-Ney
smoothing algorithm. To calculate the probabil-
ity of each candidate, we used a beam search de-
coder through the Moses decoder (Koehn et al.,
2007).
</bodyText>
<subsectionHeader confidence="0.998781">
5.2 Selecting the most probable candidate
</subsectionHeader>
<bodyText confidence="0.999913">
Previous works on spelling correction and nor-
malization used the source channel model, which
is also known as the noisy channel model and
Naïve Bayes (Beaufort et al., 2010; Kernighan,
Church, &amp; Gale, 1990; Mays, Damerau, &amp;
Mercer, 1991; Toutanova &amp; Moore, 2002; Xue,
Yin, &amp; Davison, 2011b). In the noisy channel
approach, we observe the conversion of standard
words to noisy words in a training phase in order
to build a model. In the prediction phase, the de-
coder can select the most probable candidate
based on the obtained model. The candidate se-
lection is accomplished based on only two pa-
rameters: the LM and error model, which is
computed as follows:
</bodyText>
<equation confidence="0.974002">
G = arg max_T { P(T | O) }
= arg max_T ∑_{m=1}^{M} λ_m · f_m(T, O)
</equation>
<bodyText confidence="0.929854444444445">
Where T is a target word, O is an observed
word, f_m(T, O) is a feature function, M is the
number of total feature functions, and λ_m is the
Lagrange multiplier of each function. In our case,
M equals three, in which f_1 is the positional in-
dexing, f_2 is the dependency-based frequency
feature, and f_3 is the LM probability. The
Maxent requires the λ values to be determined in
the training phase before actual usage.
</bodyText>
<sectionHeader confidence="0.975116" genericHeader="evaluation">
6 Experimental results and discussion
</sectionHeader>
<bodyText confidence="0.999963777777778">
We evaluate our approach in terms of BLEU
score (Papineni, Roukos, Ward, &amp; Zhu, 2002),
since BLEU has become a well-known and ade-
quate evaluation metric in normalization studies
(Contractor, Faruquie, &amp; Subramaniam, 2010;
Schlippe, Zhu, Gebhardt, &amp; Schultz, 2010). The
achieved baseline for the testing dataset is 42.01
BLEU score, that is, the volume of similarity
between the testing text and the reference text
(manually normalized text) in terms of BLEU
score.
In the training phase, we performed maximum
likelihood training (Papineni, Roukos, &amp; Ward,
1998; Streit &amp; Luginbuhl, 1994) for λ1, λ2 and λ3
between 0.0 and 1.0. Figure 1 shows the toler-
ance of the performance during the transition of λ1
and λ2 (when λ3 is fixed to 1.0). Figure 1 depicts
that the value of performance achieves the high-
</bodyText>
<equation confidence="0.632274714285714">
∑_{m=1}^{M} λ_m · f_m(T, O)
</equation>
<page confidence="0.990706">
23
</page>
<bodyText confidence="0.970475571428571">
est when the λ1 and λ2 are close to 0.63 and 0.9,
respectively. It is found that the best performance
is achieved by 0.6, 0.9, and 1.0 values for λ1, λ2,
and λ3, respectively. This means that LM has the
highest impact on the candidate selection, and
that dependency-based frequency has a higher
impact on candidate selection than positional.
</bodyText>
<figure confidence="0.998263214285714">
0.9
0.8
0.7
0.6
0.5
0.4
0.3
0.2
0.1
0
1
Lambda 2
Lambda 1
Performance
</figure>
<figureCaption confidence="0.999998">
Figure 1: The training of Maxent for lambda settings.
</figureCaption>
<bodyText confidence="0.999937454545455">
We divided our dataset into six equal sets in or-
der to perform 6-fold cross validation. As shown
in Table 3, the average of the obtained BLEU
scores in six evaluation rounds was 83.12. The
evaluation proves that our approach boosts the
BLEU score by 41.11 (i.e. from 42.01 to 83.12).
Since previous normalization studies used differ-
ent data sources in their experiments, a direct
comparison between our accuracy values is not
meaningful. Therefore, we re-examined one of
the state-of-the-art approaches using our dataset.
</bodyText>
<table confidence="0.745153875">
6-fold cross validation BLEU score
Round 1 80.99
Round 2 81.57
Round 3 84.82
Round 4 83.91
Round 5 83.90
Round 6 83.55
Average 83.12
</table>
<tableCaption confidence="0.804587">
Table 3: Normalization results for 6-fold cross
validation test.
</tableCaption>
<bodyText confidence="0.999752166666667">
The statistical machine translation (SMT) is a
cutting-edge approach that handles the normali-
zation problem as a statistical machine transla-
tion task; it was first introduced by Aw, Zhang,
Xiao, and Su (2006). The SMT-like approach
translates a source language (UGC) to a target
language (standard language). The experiment
was performed using Moses (Koehn et al., 2007)
for statistical translation, Giza++ (Och &amp; Ney,
2003) for word alignment, and SRILM (Stolcke,
2002) for LM compiling. The SMT system is
trained using our Twitter aligned dataset. The
optimum results were achieved using a trigram
LM and Backoff smoothing (Jelinek, 1990):
78.81 BLEU score.
Table 4 indicates some statistics about our
testing dataset. The OOV words are those detect-
ed by our OOV detection module. The BLEU
score of raw text is an important measure to ana-
lyze the difficulty of the task. It is important to
note that the dataset used in our experiment con-
tains an above average number of OOV words
compared to the datasets in other related papers.
The dataset used by Kobus et al. (2008) consists
of 32% OOV words, which is slightly lower than
34% of our dataset. In addition, Aw et al. (2006)
used a dataset with a baseline BLEU score of
57.84, which indicates that the raw text is much
more similar to the manual translated text (refer-
ence text) than the ones used in our experiment.
</bodyText>
<table confidence="0.9570768">
Avg. length of words (character) 5
Avg. number of words 11
Total No. of tokens 19,759
OOV words 34.02%
BLEU score of raw text 42.01
</table>
<tableCaption confidence="0.998256">
Table 4: Statistics of testing dataset.
</tableCaption>
<bodyText confidence="0.9944594">
As shown in Table 4, the average length of
words is five characters, which makes the nor-
malization task more difficult. For example, the
candidate set for the OOV word “yoor” contains
59 words, as shown in Table 5. The large number
of candidates causes difficulty for candidate se-
lection because more options lead to more possi-
bilities and more computational cost. Further-
more, the generated candidates are lexically, syn-
tactically, and semantically very akin to each
other. For example, for the OOV word “yoor”,
“our” might be mistakenly selected instead of
“your”. There are a smaller number of potential
candidates for lengthy OOV words. As shown in
Table 5, the number of candidates for the OOV
</bodyText>
<page confidence="0.997038">
24
</page>
<bodyText confidence="0.968607104166667">
word “acessibility” is only 14, which is less than
average, thereby making candidate selection eas-
ier. Moreover, there is a distinct difference be-
tween the meanings of candidates, which is an
easy situation for our context-based probability
functions to select the correct one. Although our
approach obtained promising results on this da-
taset, it works better on long words.
OOV word Candidate set No. of
candidates
acessibility accessibility, accessi- 14
bly, basicity, bicy-
clists, bicyclist, itali-
cizes, abilities, bicy-
clist, sibilates, stabi-
lize, silicates, celiba-
cy, bicycles, and bi-
cycle.
yoor your, you, door, our, 59
or, yoga, yak, yuck,
yule, moon, tour,
poor, ...
Table 5: Example of candidate sets for OOV
words.
Our approach and SMT-like system attained
BLEU scores of 83.12 and 78.81, respectively.
This result proves that if we integrate three prob-
ability scores via Maxent, promising normaliza-
tion accuracy can be obtained. This result con-
firms that a normalization system constructed
based on the Maxent principle can surpass state-
of-the-art systems. However, several drawbacks
of our method were disclosed by inspecting the
output of the system. The most noticeable one is
that the approach fails when tackling very noisy
text, that is, ample usage of OOV words in a text.
We altered our dataset to have higher levels of
noise using an approach introduced by Gadde,
Goutam, Shah, Bayyarapu, and Subramaniam
(2011), which artificially generates OOV words.
If the percentage of OOV words crosses 45%,
the accuracy of the method drastically drops to a
BLEU score of less than 65. Another shortcom-
ing of our approach is that it is not able to ad-
dress combined words and abbreviations (e.g.
alot — a lot, btw — by the way) because candi-
date generation module forms only single words
for each OOV.
</bodyText>
<sectionHeader confidence="0.993753" genericHeader="conclusions">
7 Conclusion
</sectionHeader>
<bodyText confidence="0.999917954545455">
In this paper, we have presented a normalization
approach based on the maximum entropy model.
This approach provides a unified layout for in-
corporating different sources of features to nor-
malize Twitter messages. Our proposed approach
consists of three stages: preprocessing, candidate
generation, and candidate selection. The ap-
proach is robust to normalize unseen words since
its candidate generation stage does not practice
machine-learning methods. In the preprocessing
stage, after trimming erroneous whitespaces and
tokenization, OOV words are detected via the
GNU Aspell dictionary. Normalized candidates
are generated for each OOV word in the second
stage regarding to lexical, phonemic, and mor-
phophonemic similarities. Since code-switching
between Malay and English is very common in
our dataset, the potential English translation of
OOV words is also added to the candidate set.
In the third stage, three conditional probability
scores are assigned to each candidate: 1) posi-
tional indexing considers the probability of posi-
tional locations of term occurrences inside doc-
uments, 2) dependency-based frequency
measures the probability of prevalence of the
dependency relation of words to each other, and
3) the language model indicates the probability
of distribution of the sequence of words. Finally,
the best candidate is selected. Maximum entropy
integrates the obtained probability scores to es-
timate the ultimate probability of each candidate.
The approach is examined using 7,000 parallel
Twitter messages, which is split into 5,000 mes-
sages for training and 2,000 for testing. The re-
sult is promising whereby we achieve a BLEU
score of 83.12 against the baseline BLEU, which
scores 42.01. We have compared our approach
with a SMT-like approach using the same da-
taset. The accuracy of the SMT-like was lower
than our approach (i.e. 78.81 BLEU score for the
SMT-like). For future work, we will examine the
Maxent normalization approach with more prob-
ability functions, such as distributional clustering
and semantic features.
</bodyText>
<sectionHeader confidence="0.997484" genericHeader="acknowledgments">
Acknowledgments
</sectionHeader>
<bodyText confidence="0.994387">
The research for this paper was financially sup-
ported by the University of Malaya FRGS Grant
(FP021-2014B). We thank Asad Abdi for assis-
tance with graphical illustrations.
</bodyText>
<page confidence="0.997174">
25
</page>
<sectionHeader confidence="0.884413" genericHeader="references">
Reference
</sectionHeader>
<reference confidence="0.999868897196262">
Aw, A., Zhang, M., Xiao, J., &amp; Su, J. (2006). A
Phrase-based Statistical Model for SMS Text Nor-
malization. In Proceedings of the COLING/ACL
on Main Conference Poster Sessions (pp. 33–40).
Stroudsburg, PA, USA: Association for Computa-
tional Linguistics.
Beaufort, R., Roekhaut, S., Cougnon, L.-A., &amp; Fairon,
C. (2010). A Hybrid Rule/Model-based Finite-state
Framework for Normalizing SMS Messages. In
Proceedings of the 48th Annual Meeting of the As-
sociation for Computational Linguistics (pp. 770–
779). Stroudsburg, PA, USA: Association for
Computational Linguistics.
Berger, A. L., Pietra, V. J. Della, &amp; Pietra, S. A. Del-
la. (1996). A Maximum Entropy Approach to Nat-
ural Language Processing. Comput. Linguist.,
22(1), 39–71.
Bieswanger, M. (2007). 2 abbrevi8 or not 2 abbrevi8:
A Contrastive Analysis of Different Space- and
Time-Saving Strategies in English and German
Text Messages. Texas Linguistic Forum, Vol. 50.
Choudhury, M., Saraf, R., Jain, V., Sarkar, S., &amp;
Basu, A. (2007). Investigation and Modeling of the
Structure of Texting Language, 63–70.
Clark, E., &amp; Araki, K. (2011). Text Normalization in
Social Media: Progress, Problems and Applications
for a Pre-Processing System of Casual English.
Procedia - Social and Behavioral Sciences, 27(0),
2–11.
Contractor, D., Faruquie, T. A., &amp; Subramaniam, L.
V. (2010). Unsupervised Cleansing of Noisy Text.
In Proceedings of the 23rd International Confer-
ence on Computational Linguistics: Posters (pp.
189–196). Stroudsburg, PA, USA: Association for
Computational Linguistics.
Cook, P., &amp; Stevenson, S. (2009). An Unsupervised
Model for Text Message Normalization. In Pro-
ceedings of the Workshop on Computational Ap-
proaches to Linguistic Creativity (pp. 71–78).
Stroudsburg, PA, USA: Association for Computa-
tional Linguistics.
Daugherty, T., Eastin, M. S., &amp; Bright, L. (2008).
Exploring Consumer Motivations for Creating Us-
er-Generated Content. Journal of Interactive Ad-
vertising, 8(2).
Gadde, P., Goutam, R., Shah, R., Bayyarapu, H. S., &amp;
Subramaniam, L. V. (2011). Experiments with Ar-
tificially Generated Noise for Cleansing Noisy
Text. In Proceedings of the 2011 Joint Workshop
on Multilingual OCR and Analytics for Noisy Un-
structured Text Data (pp. 4:1–4:8). New York, NY,
USA: ACM. doi:10.1145/2034617.2034622
Han, B., &amp; Baldwin, T. (2011). Lexical Normalisation
of Short Text Messages: Makn Sens a #Twitter. In
Proceedings of the 49th Annual Meeting of the As-
sociation for Computational Linguistics: Human
Language Technologies - Volume 1 (pp. 368–378).
Stroudsburg, PA, USA: Association for Computa-
tional Linguistics.
Han, B., Cook, P., &amp; Baldwin, T. (2012). Automati-
cally Constructing a Normalisation Dictionary for
Microblogs. In Proceedings of the 2012 Joint Con-
ference on Empirical Methods in Natural Language
Processing and Computational Natural Language
Learning (pp. 421–432). Stroudsburg, PA, USA:
Association for Computational Linguistics.
Jelinek, F. (1990). Readings in Speech Recognition.
In A. Waibel &amp; K.-F. Lee (Eds.), (pp. 450–506).
San Francisco, CA, USA: Morgan Kaufmann Pub-
lishers Inc.
Kaufmann, M., &amp; Kalita, J. (2010). Syntactic normal-
ization of Twitter messages. International Confer-
ence on Natural Language Processing, Kharagpur,
India.
Kernighan, M. D., Church, K. W., &amp; Gale, W. A.
(1990). A Spelling Correction Program Based on a
Noisy Channel Model. In Proceedings of the 13th
Conference on Computational Linguistics - Vol-
ume 2 (pp. 205–210). Stroudsburg, PA, USA: As-
sociation for Computational Linguistics.
Kobus, C., Yvon, F., &amp; Damnati, G. (2008). Normal-
izing SMS: Are Two Metaphors Better Than One?
In Proceedings of the 22Nd International Confer-
ence on Computational Linguistics - Volume 1 (pp.
441–448). Stroudsburg, PA, USA: Association for
Computational Linguistics.
Koehn, P., Hoang, H., Birch, A., Callison-Burch, C.,
Federico, M., Bertoldi, N., ... Herbst, E. (2007).
Moses: Open Source Toolkit for Statistical Ma-
chine Translation. In Proceedings of the 45th An-
nual Meeting of the ACL on Interactive Poster and
Demonstration Sessions (pp. 177–180). Strouds-
burg, PA, USA: Association for Computational
Linguistics.
Lin, H., Bilmes, J., Vergyri, D., &amp; Kirchhoff, K.
(2007). OOV detection by joint word/phone lattice
alignment. In Automatic Speech Recognition Un-
derstanding, 2007. ASRU. IEEE Workshop on (pp.
478–483). doi:10.1109/ASRU.2007.4430159
Liu, F., Weng, F., Wang, B., &amp; Liu, Y. (2011). Inser-
tion, Deletion, or Substitution?: Normalizing Text
Messages Without Pre-categorization nor Supervi-
sion. In Proceedings of the 49th Annual Meeting of
the Association for Computational Linguistics:
Human Language Technologies: Short Papers -
Volume 2 (pp. 71–76). Stroudsburg, PA, USA: As-
sociation for Computational Linguistics.
</reference>
<page confidence="0.948464">
26
</page>
<reference confidence="0.999976572815534">
Lopez Ludeña, V., San Segundo, R., Montero, J. M.,
Barra Chicote, R., &amp; Lorenzo, J. (2012). Architec-
ture for Text Normalization using Statistical Ma-
chine Translation techniques. In IberSPEECH 2012
(pp. 112–122). Madrid, Spain: Springer.
Manning, C. D., &amp; Raghavan, P. (2009). An Introduc-
tion to Information Retrieval. Online.
doi:10.1109/LPT.2009.2020494
Marneffe, M.-C. de, MacCartney, B., &amp; Manning, C.
D. (2006). Generating typed dependency parses
from phrase structure parses. In The International
Conference on Language Resources and Evaluation
(LREC) (pp. 449–454). Genova, Italy.
Mays, E., Damerau, F. J., &amp; Mercer, R. L. (1991).
Context based spelling correction. Information
Processing &amp; Management, 27(5), 517–522.
Novak, J., Yang, D., Minematsu, N., &amp; Hirose, K.
(2011). Phonetisaurus: A wfst-driven phoneticizer.
The University of Tokyo, Tokyo Institute of Tech-
nology. Retrieved January 1, 2014, from
http://code.google.com/p/phonetisaurus/
Och, F. J., &amp; Ney, H. (2002). Discriminative Training
and Maximum Entropy Models for Statistical Ma-
chine Translation. In Proceedings of the 40th An-
nual Meeting on Association for Computational
Linguistics (pp. 295–302). Stroudsburg, PA, USA:
Association for Computational Linguistics.
Och, F. J., &amp; Ney, H. (2003). A Systematic Compari-
son of Various Statistical Alignment Models.
Comput. Linguist., 29(1), 19–51.
Oliva, J., Serrano, J. I., Del Castillo, M. D., &amp; Iglesias,
Á. (2013). A SMS Normalization System Integrat-
ing Multiple Grammatical Resources. Natural Lan-
guage Engineering, 19(01), 121–141.
Papineni, K., Roukos, S., &amp; Ward, T. (1998). Maxi-
mum likelihood and discriminative training of di-
rect translation models. In Acoustics, Speech and
Signal Processing, 1998. Proceedings of the 1998
IEEE International Conference on (Vol. 1, pp.
189–192 vol.1).
Papineni, K., Roukos, S., Ward, T., &amp; Zhu, W.-J.
(2002). BLEU: A Method for Automatic Evalua-
tion of Machine Translation. In Proceedings of the
40th Annual Meeting on Association for Computa-
tional Linguistics (pp. 311–318). Stroudsburg, PA,
USA: Association for Computational Linguistics.
Pennell, D. L., &amp; Liu, Y. (2010). Normalization of
text messages for text-to-speech. Acoustics Speech
and Signal Processing (ICASSP), 2010 IEEE In-
ternational Conference on.
Saloot, M. A., Idris, N., &amp; Aw, A. (2014). Noisy Text
Normalization Using an Enhanced Language Mod-
el. In Proceedings of the International Conference
on Artificial Intelligence and Pattern Recognition
(pp. 111–122). Kuala Lumpur, Malaysia: SDIWC.
Saloot, M. A., Idris, N., Aw, A., &amp; Thorleuchter, D.
(2014). Twitter corpus creation: The case of a Ma-
lay Chat-style-text Corpus (MCC). Digital Scholar-
ship in the Humanities. Retrieved from
http://dsh.oxfordjournals.org/content/early/2014/12
/13/llc.fqu066.abstract
Saloot, M. A., Idris, N., &amp; Mahmud, R. (2014). An
architecture for Malay Tweet normalization. In-
formation Processing &amp; Management, 50(5), 621–
633.
Schlippe, T., Zhu, C., Gebhardt, J., &amp; Schultz, T.
(2010). Text normalization based on statistical ma-
chine translation and internet user support. In T.
Kobayashi, K. Hirose, &amp; S. Nakamura (Eds.), IN-
TERSPEECH (pp. 1816–1819). ISCA.
Smith, J., &amp; Padi, P. (2006). Lets make a dictionary.
In Proceedings of the the Eighth Biennial Confer-
ence of the Borneo Research Council (BRC) (pp.
515–520). Sarawak, Malaysia: Borneo Research
Council (BRC).
Sproat, R., Black, A. W., Chen, S., Kumar, S., Osten-
dorf, M., &amp; Richards, C. (2001). Normalization of
non-standard words. Computer Speech &amp; Lan-
guage, 15(3), 287–333.
Stolcke, A. (2002). SRILM-an extensible language
modeling toolkit. In Proceedings International
Conference on Spoken Language Processing (pp.
257–286).
Streit, R. L., &amp; Luginbuhl, T. E. (1994). Maximum
likelihood training of probabilistic neural networks.
Neural Networks, IEEE Transactions on, 5(5),
764–783. doi:10.1109/72.317728
Thurlow, C., &amp; Brown, A. (2003). Generation Txt?
The sociolinguistics of young people’s text-
messaging.
Toutanova, K., &amp; Moore, R. C. (2002). Pronunciation
Modeling for Improved Spelling Correction. In
Proceedings of the 40th Annual Meeting on Asso-
ciation for Computational Linguistics (pp. 144–
151). Stroudsburg, PA, USA: Association for
Computational Linguistics.
Xue, Z., Yin, D., &amp; Davison, B. D. (2011a). Normal-
izing Microtext. In Analyzing Microtext (Vol. WS-
11–05). AAAI.
Xue, Z., Yin, D., &amp; Davison, B. D. (2011b). Normal-
izing Microtext. In Analyzing Microtext: Papers
from the 2011 AAAI Workshop (pp. 74–79). San
Francisco, CA, USA: AAAI.
</reference>
<page confidence="0.99881">
27
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.046706">
<title confidence="0.996836">Toward Tweets Normalization Using Maximum Entropy</title>
<author confidence="0.997806">Mohammad Arshi Saloot</author>
<affiliation confidence="0.9979765">Department of Artificial Intelligence, University of</affiliation>
<address confidence="0.998277">Malaya, 50603, Malaysia</address>
<email confidence="0.99982">phd_siamak@yahoo.com</email>
<author confidence="0.801173">Norisma</author>
<affiliation confidence="0.9987285">Department of Intelligence, University</affiliation>
<address confidence="0.999938">Malaya, 50603, Malaysia</address>
<email confidence="0.972593">norisma@um.edu.my</email>
<author confidence="0.54812">Liyana</author>
<affiliation confidence="0.997341">Department of System, University of</affiliation>
<address confidence="0.999004">50603, Malaysia</address>
<email confidence="0.98559">liyanashuib@um.edu.my</email>
<author confidence="0.999607">Ram Gopal</author>
<affiliation confidence="0.999418">Department of Intelligence, University</affiliation>
<address confidence="0.999928">Malaya, 50603, Malaysia</address>
<email confidence="0.95475">ramdr@um.edu.my</email>
<note confidence="0.858511">Corresponding author</note>
<abstract confidence="0.997779705882353">The use of social network services and microblogs, such as Twitter, has created valuable text resources, which contain extremely noisy text. Twitter messages contain so much noise that it is difficult to use them in natural language processing tasks. This paper presents a new approach using the maximum entropy model for normalizing Tweets. The proposed approach addresses words that are unseen in the training phase. Although the maximum entropy needs a training dataset to adjust its parameters, the proposed approach can normalize unseen data in the training set. The principle of maximum entropy emphasizes incorporating the available features into a uniform model. First, we generate a set of normalized candidates for each out-of-vocabulary word based on lexical, phonemic, and morphophonemic similarities. Then, three different probability scores are calculated for each candidate using positional indexing, a dependency-based frequency feature and a language model. After the optimal values of the model parameters are obtained in a training phase, the model can calculate the final probability value for candidates. The approach achieved an 83.12 BLEU score in testing using 2,000 Tweets. Our experimental results show that the maximum entropy approach significantly outperforms previ-</abstract>
<author confidence="0.704573">AiTi Aw</author>
<affiliation confidence="0.841053">Institute for Infocomm Research (I2R),</affiliation>
<address confidence="0.479838">A*STAR, Singapore</address>
<email confidence="0.339611">aaiti@i2r.a-star.edu.sg</email>
<abstract confidence="0.860619">ous well-known normalization approaches.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>A Aw</author>
<author>M Zhang</author>
<author>J Xiao</author>
<author>J Su</author>
</authors>
<title>A Phrase-based Statistical Model for SMS Text Normalization.</title>
<date>2006</date>
<booktitle>In Proceedings of the COLING/ACL on Main Conference Poster Sessions</booktitle>
<pages>33--40</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="24384" citStr="Aw et al. (2006)" startWordPosition="3881" endWordPosition="3884">results were achieved using a trigram LM and Backoff smoothing (Jelinek, 1990): 78.81 BLEU score. Table 4 indicates some statistics about our testing dataset. The OOV words are those detected by our OOV detection module. The BLEU score of raw text is an important measure to analyze the difficulty of the task. It is important to note that the dataset used in our experiment contains an above average number of OOV words compared to the datasets in other related papers. The dataset used by Kobus et al. (2008) consists of 32% OOV words, which is slightly lower than 34% of our dataset. In addition, Aw et al. (2006) used a dataset with a baseline BLEU score of 57.84, which indicates that the raw text is much more similar to the manual translated text (reference text) than the ones used in our experiment. Avg. length of words (character) 5 Avg. number of words 11 Total No. of tokens 19,759 OOV words 34.02% BLEU score of raw text 42.01 Table 4: Statistics of testing dataset. As shown in Table 4, the average length of words is five characters, which makes the normalization task more difficult. For example, the candidate set for the OOV word “yoor” contains 59 words, as shown in Table 5. The large number of </context>
</contexts>
<marker>Aw, Zhang, Xiao, Su, 2006</marker>
<rawString>Aw, A., Zhang, M., Xiao, J., &amp; Su, J. (2006). A Phrase-based Statistical Model for SMS Text Normalization. In Proceedings of the COLING/ACL on Main Conference Poster Sessions (pp. 33–40). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>R Beaufort</author>
<author>S Roekhaut</author>
<author>L-A Cougnon</author>
<author>C Fairon</author>
</authors>
<title>A Hybrid Rule/Model-based Finite-state Framework for Normalizing SMS Messages.</title>
<date>2010</date>
<booktitle>In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics</booktitle>
<pages>770--779</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="20233" citStr="Beaufort et al., 2010" startWordPosition="3163" endWordPosition="3166">bility measurement calculates the probabilities based on a language model. The cleansed part of our training dataset, which consists of more than 55,000 words, is fed into SRILM (Stolcke, 2002) to compile a bidirectional trigram LM by employing the Kneser-Ney smoothing algorithm. To calculate the probability of each candidate, we used a beam search decoder through the Moses decoder (Koehn et al., 2007). 5.2 Selecting the most probable candidate Previous works on spelling correction and normalization used the source channel model, which is also known as the noisy channel model and Naïve Bayes (Beaufort et al., 2010; Kernighan, Church, &amp; Gale, 1990; Mays, Damerau, &amp; Mercer, 1991; Toutanova &amp; Moore, 2002; Xue, Yin, &amp; Davison, 2011b). In the noisy channel approach, we observe the conversion of standard words to noisy words in a training phase in order to build a model. In the prediction phase, the decoder can select the most probable candidate based on the obtained model. The candidate selection is accomplished based on only two parameters: the LM and error model, which is computed as follows: G = arg max{ P(T |O )} = arg max Where T is a target word, 0 is an observed word, fm (T, O) is a feature function,</context>
</contexts>
<marker>Beaufort, Roekhaut, Cougnon, Fairon, 2010</marker>
<rawString>Beaufort, R., Roekhaut, S., Cougnon, L.-A., &amp; Fairon, C. (2010). A Hybrid Rule/Model-based Finite-state Framework for Normalizing SMS Messages. In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics (pp. 770– 779). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>A L Berger</author>
<author>V J Della Pietra</author>
<author>S A Della Pietra</author>
</authors>
<title>A Maximum Entropy Approach to Natural Language Processing.</title>
<date>1996</date>
<journal>Comput. Linguist.,</journal>
<volume>22</volume>
<issue>1</issue>
<pages>39--71</pages>
<contexts>
<context position="4852" citStr="Berger, Pietra, &amp; Pietra, 1996" startWordPosition="724" endWordPosition="728">h as much brevity as possible. The normalization bears a resemblance to spelling correction. The ultimate goal of which is the detection and correction of OOV words. The spelling correction methods only focus on misspelled words while normalization systems consider all forms of OOV words, such as representing sounds phonetically (e.g. by the way — btw) and shortened forms (e.g. university — uni). Thus, normalization approaches should address a higher volume of OOV words compared to spelling correction approaches that lead to more complexity. To address this complexity, we use maximum entropy (Berger, Pietra, &amp; Pietra, 1996; Och &amp; Ney, 2002) for utilizing and incorporating more probability functions. Our approach is based on the hypothesis that integrating more probability functions will boost the performance of the method; however, the available information and number of probability functions for (OOV word, standard word) pairs are always limited. Maximum entropy (Maxent) provides a criterion for integrating probability distributions based on partial knowledge. The Maxent produces the lowest biased estimation on the given information, that is, it is maximally neutral regarding missing information. When defining</context>
</contexts>
<marker>Berger, Pietra, Pietra, 1996</marker>
<rawString>Berger, A. L., Pietra, V. J. Della, &amp; Pietra, S. A. Della. (1996). A Maximum Entropy Approach to Natural Language Processing. Comput. Linguist., 22(1), 39–71.</rawString>
</citation>
<citation valid="true">
<authors>
<author>M Bieswanger</author>
</authors>
<title>2 abbrevi8 or not 2 abbrevi8: A Contrastive Analysis of Different Space- and Time-Saving Strategies in English and German Text Messages. Texas Linguistic Forum,</title>
<date>2007</date>
<volume>50</volume>
<contexts>
<context position="2446" citStr="Bieswanger, 2007" startWordPosition="346" endWordPosition="347">itute for Infocomm Research (I2R), A*STAR, Singapore aaiti@i2r.a-star.edu.sg ous well-known normalization approaches. 1 Introduction The advent of Web 2.0 and electronic communications has enabled the extensive creation and dissemination of user-generated content (UGC). The UGC collections provide invaluable data sources in order to mine and extract beneficial information and knowledge, while, at the same time, resulting in less standardized language (Clark &amp; Araki, 2011; Daugherty, Eastin, &amp; Bright, 2008). However, such content diverges from standard writing conventions. As shown by experts (Bieswanger, 2007; Thurlow &amp; Brown, 2003), this divergence is due to the usage of a variety of coding strategies, including digit phonemes (you too — you2), phonetic transcriptions (you — u), vowel drops (dinner — dnnr), misspellings (convenience — convineince), and missing or incorrect punctuation marks (If I were you, I&apos;d probably go. — If I were you Id probably go). These alterations are due to three main parameters: 1) The small allowance of characters, 2) the constraints of the small keypads, and 3) using UGC in informal communications between friends and relatives. Whatever their causes, these alteration</context>
</contexts>
<marker>Bieswanger, 2007</marker>
<rawString>Bieswanger, M. (2007). 2 abbrevi8 or not 2 abbrevi8: A Contrastive Analysis of Different Space- and Time-Saving Strategies in English and German Text Messages. Texas Linguistic Forum, Vol. 50.</rawString>
</citation>
<citation valid="true">
<authors>
<author>M Choudhury</author>
<author>R Saraf</author>
<author>V Jain</author>
<author>S Sarkar</author>
<author>A Basu</author>
</authors>
<date>2007</date>
<booktitle>Investigation and Modeling of the Structure of Texting Language,</booktitle>
<pages>63--70</pages>
<marker>Choudhury, Saraf, Jain, Sarkar, Basu, 2007</marker>
<rawString>Choudhury, M., Saraf, R., Jain, V., Sarkar, S., &amp; Basu, A. (2007). Investigation and Modeling of the Structure of Texting Language, 63–70.</rawString>
</citation>
<citation valid="true">
<authors>
<author>E Clark</author>
<author>K Araki</author>
</authors>
<title>Text Normalization in Social Media: Progress, Problems and Applications for a Pre-Processing System of Casual English. Procedia - Social and Behavioral Sciences,</title>
<date>2011</date>
<volume>27</volume>
<issue>0</issue>
<pages>2--11</pages>
<contexts>
<context position="2305" citStr="Clark &amp; Araki, 2011" startWordPosition="325" endWordPosition="328">core in testing using 2,000 Tweets. Our experimental results show that the maximum entropy approach significantly outperforms previAiTi Aw* Institute for Infocomm Research (I2R), A*STAR, Singapore aaiti@i2r.a-star.edu.sg ous well-known normalization approaches. 1 Introduction The advent of Web 2.0 and electronic communications has enabled the extensive creation and dissemination of user-generated content (UGC). The UGC collections provide invaluable data sources in order to mine and extract beneficial information and knowledge, while, at the same time, resulting in less standardized language (Clark &amp; Araki, 2011; Daugherty, Eastin, &amp; Bright, 2008). However, such content diverges from standard writing conventions. As shown by experts (Bieswanger, 2007; Thurlow &amp; Brown, 2003), this divergence is due to the usage of a variety of coding strategies, including digit phonemes (you too — you2), phonetic transcriptions (you — u), vowel drops (dinner — dnnr), misspellings (convenience — convineince), and missing or incorrect punctuation marks (If I were you, I&apos;d probably go. — If I were you Id probably go). These alterations are due to three main parameters: 1) The small allowance of characters, 2) the constra</context>
<context position="8721" citStr="Clark &amp; Araki, 2011" startWordPosition="1312" endWordPosition="1315">ervised method using probabilistic models for only three common abbreviation types: stylistic variation, prefix clipping, and subsequence abbreviation. In addition, Beaufort, Roekhaut, Cougnon, and Fairon (2010) merged 20 the SMT-like and the spell checking approaches to normalize French SMSs. The third group is the dictionary based normalization approach, which is an easy-to-use and fast solution. This approach requires a dictionary whose entries are OOV and standard form pairs. It has been proven that using a colloquial dictionary can outperform some state-of-the-art and complex approaches (Clark &amp; Araki, 2011; Saloot, Idris, &amp; Mahmud, 2014). However, its performance highly relies on the size of the dictionary. Therefore, Han, Cook, and Baldwin (2012) introduced a method to automatically compile a large dictionary. To address the shortcomings of the dictionary approach, Oliva, Serrano, Del Castillo, and Igesias (2013) introduced a special Spanish phonetic dictionary, in which each entry is formed by a coded consonant string, vowels strings, and their positions in the word, for normalizing Spanish SMS texts. The fourth group resembles automatic speech recognition (ASR) systems. This paradigm consist</context>
<context position="11433" citStr="Clark &amp; Araki, 2011" startWordPosition="1724" endWordPosition="1727">datasets. For example, the PTB-Tokenizer is a fast, deterministic, and efficient tokenization method. On the other hand, UGC text demands special methods due to irregularities in its whitespaces and punctuation. As suggested by Lopez Ludeña et al. (2012), we employ a straightforward word separating method, which performs tokenization based on whitespace characters. One of the most important primary steps in unsupervised normalization systems is to detect OOV words. Hanspell and GNU Aspell are two well-known spell checker systems, however, Aspell performance is more accurate on the noisy text (Clark &amp; Araki, 2011). The Aspell dictionary is utilized to distinguish between OOV and standard English words. In addition, we used seven regular expression rules, which were introduced by Saloot, Idris, and Aw (2014). This helps to detect proper nouns, email and URL addresses, Twitter special symbols, and digits. The potential errors in the OOV word detection step would not affect the performance of the normalization system since the detected OOV word will be included in the candidate set. 4 Candidate generation For each given OOV word, a set of normalized candidates is generated via four different modules. The </context>
</contexts>
<marker>Clark, Araki, 2011</marker>
<rawString>Clark, E., &amp; Araki, K. (2011). Text Normalization in Social Media: Progress, Problems and Applications for a Pre-Processing System of Casual English. Procedia - Social and Behavioral Sciences, 27(0), 2–11.</rawString>
</citation>
<citation valid="true">
<authors>
<author>D Contractor</author>
<author>T A Faruquie</author>
<author>L V Subramaniam</author>
</authors>
<title>Unsupervised Cleansing of Noisy Text.</title>
<date>2010</date>
<booktitle>In Proceedings of the 23rd International Conference on Computational Linguistics: Posters</booktitle>
<pages>189--196</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="21414" citStr="Contractor, Faruquie, &amp; Subramaniam, 2010" startWordPosition="3372" endWordPosition="3376">ord, 0 is an observed word, fm (T, O) is a feature function, M is a number of total feature functions, and X is a Lagrange multiplier of each function. In our case, M equals three, in which f1 is the positional indexing, f2 is the dependency-based frequency feature, and f3 is the LM probability. The Maxent requires X being determined in the training phase before the actual usage. 6 Experimental results and discussion We evaluate our approach in terms of BLEU score (Papineni, Roukos, Ward, &amp; Zhu, 2002), since BLEU has become a well-known and adequate evaluation metric in normalization studies (Contractor, Faruquie, &amp; Subramaniam, 2010; Schlippe, Zhu, Gebhardt, &amp; Schultz, 2010). The achieved baseline for the testing dataset is 42.01 BLEU score, that is, the volume of similarity between the testing text and the reference text (manually normalized text) in term of BLEU score. In the training phase, we performed maximum likelihood training (Papineni, Roukos, &amp; Ward, 1998; Streit &amp; Luginbuhl, 1994) for X1, X2 and X3 between 0.0 and 1.0. Figure 1 shows the tolerance of the performance while transition of X1 and X2 (when X3 is fixed to 1.0). Figure 1 depicts that the value of performance achieves the high(T, O) Y M ∑ Am •fm m </context>
</contexts>
<marker>Contractor, Faruquie, Subramaniam, 2010</marker>
<rawString>Contractor, D., Faruquie, T. A., &amp; Subramaniam, L. V. (2010). Unsupervised Cleansing of Noisy Text. In Proceedings of the 23rd International Conference on Computational Linguistics: Posters (pp. 189–196). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>P Cook</author>
<author>S Stevenson</author>
</authors>
<title>An Unsupervised Model for Text Message Normalization.</title>
<date>2009</date>
<booktitle>In Proceedings of the Workshop on Computational Approaches to Linguistic Creativity</booktitle>
<pages>71--78</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="8060" citStr="Cook and Stevenson (2009)" startWordPosition="1215" endWordPosition="1218">Chicote, &amp; Lorenzo, 2012). For example, Kaufmann and Kalita (2010) used the SMT-like approach to normalize English Tweets. To normalize SMS language, a supervised noisy channel model was introduced by Choudhury, Saraf, Jain, Sarkar, and Basu (2007) that used a hidden Markov model (HMM). This approach mimics the spell checking task that tries to handle the normalization problem via noisy channel models that study the UGC text as a noisy version of standard language. This paradigm has been scrutinized and enhanced by other researchers (Liu et al., 2011; Xue, Yin, &amp; Davison, 2011a). For example, Cook and Stevenson (2009) modified this approach to design an unsupervised method using probabilistic models for only three common abbreviation types: stylistic variation, prefix clipping, and subsequence abbreviation. In addition, Beaufort, Roekhaut, Cougnon, and Fairon (2010) merged 20 the SMT-like and the spell checking approaches to normalize French SMSs. The third group is the dictionary based normalization approach, which is an easy-to-use and fast solution. This approach requires a dictionary whose entries are OOV and standard form pairs. It has been proven that using a colloquial dictionary can outperform some</context>
</contexts>
<marker>Cook, Stevenson, 2009</marker>
<rawString>Cook, P., &amp; Stevenson, S. (2009). An Unsupervised Model for Text Message Normalization. In Proceedings of the Workshop on Computational Approaches to Linguistic Creativity (pp. 71–78). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>T Daugherty</author>
<author>M S Eastin</author>
<author>L Bright</author>
</authors>
<title>Exploring Consumer Motivations for Creating User-Generated Content.</title>
<date>2008</date>
<journal>Journal of Interactive Advertising,</journal>
<volume>8</volume>
<issue>2</issue>
<contexts>
<context position="2340" citStr="Daugherty, Eastin, &amp; Bright, 2008" startWordPosition="329" endWordPosition="333"> 2,000 Tweets. Our experimental results show that the maximum entropy approach significantly outperforms previAiTi Aw* Institute for Infocomm Research (I2R), A*STAR, Singapore aaiti@i2r.a-star.edu.sg ous well-known normalization approaches. 1 Introduction The advent of Web 2.0 and electronic communications has enabled the extensive creation and dissemination of user-generated content (UGC). The UGC collections provide invaluable data sources in order to mine and extract beneficial information and knowledge, while, at the same time, resulting in less standardized language (Clark &amp; Araki, 2011; Daugherty, Eastin, &amp; Bright, 2008). However, such content diverges from standard writing conventions. As shown by experts (Bieswanger, 2007; Thurlow &amp; Brown, 2003), this divergence is due to the usage of a variety of coding strategies, including digit phonemes (you too — you2), phonetic transcriptions (you — u), vowel drops (dinner — dnnr), misspellings (convenience — convineince), and missing or incorrect punctuation marks (If I were you, I&apos;d probably go. — If I were you Id probably go). These alterations are due to three main parameters: 1) The small allowance of characters, 2) the constraints of the small keypads, and 3) u</context>
</contexts>
<marker>Daugherty, Eastin, Bright, 2008</marker>
<rawString>Daugherty, T., Eastin, M. S., &amp; Bright, L. (2008). Exploring Consumer Motivations for Creating User-Generated Content. Journal of Interactive Advertising, 8(2).</rawString>
</citation>
<citation valid="true">
<authors>
<author>P Gadde</author>
<author>R Goutam</author>
<author>R Shah</author>
<author>H S Bayyarapu</author>
<author>L V Subramaniam</author>
</authors>
<title>Experiments with Artificially Generated Noise for Cleansing Noisy Text.</title>
<date>2011</date>
<booktitle>In Proceedings of the 2011 Joint Workshop on Multilingual OCR and Analytics for Noisy Unstructured Text Data</booktitle>
<pages>4:1--4:8</pages>
<publisher>ACM.</publisher>
<location>New York, NY, USA:</location>
<marker>Gadde, Goutam, Shah, Bayyarapu, Subramaniam, 2011</marker>
<rawString>Gadde, P., Goutam, R., Shah, R., Bayyarapu, H. S., &amp; Subramaniam, L. V. (2011). Experiments with Artificially Generated Noise for Cleansing Noisy Text. In Proceedings of the 2011 Joint Workshop on Multilingual OCR and Analytics for Noisy Unstructured Text Data (pp. 4:1–4:8). New York, NY, USA: ACM. doi:10.1145/2034617.2034622</rawString>
</citation>
<citation valid="true">
<authors>
<author>B Han</author>
<author>T Baldwin</author>
</authors>
<title>Lexical Normalisation of Short Text Messages: Makn Sens a #Twitter.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies -</booktitle>
<volume>1</volume>
<pages>368--378</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="9843" citStr="Han and Baldwin (2011)" startWordPosition="1484" endWordPosition="1487"> texts. The fourth group resembles automatic speech recognition (ASR) systems. This paradigm consists of three steps: 1) converting the text to strings of phonemes via letter-to-phone rules, 2) converting the strings of phonemes to words via pronunciation dictionaries, and 3) choosing the most probable words. The ASR-like approach has been merged with other approaches to boost its performance. Kobus, Yvon, and Damnati (2008) combined ASR-like and SMT-like approaches to normalize French SMSs. Lin, Bilmes, Vergyri, and Kirchhoff (2007) used this approach to detect OOV words in switchboard data. Han and Baldwin (2011) illustrated a lexical method for normalizing Twitter messages. After detecting OOVs, ill-formed words, and generating a set of candidates, the best candidate is selected using a variety of metrics: lexical edit distance, phonemic edit distance, longest common subsequence (LCS), affix substring, language model, and dependency-based frequency features. The method achieved a 93.4 BLEU score in normalizing 549 English Tweets. This inspired us to design a normalization method that has three major stages: preprocessing, candidate generation, and candidate selection. 3 Preprocessing First, we perfor</context>
<context position="12256" citStr="Han and Baldwin (2011)" startWordPosition="1857" endWordPosition="1860">is helps to detect proper nouns, email and URL addresses, Twitter special symbols, and digits. The potential errors in the OOV word detection step would not affect the performance of the normalization system since the detected OOV word will be included in the candidate set. 4 Candidate generation For each given OOV word, a set of normalized candidates is generated via four different modules. The first module executes a lexical candidate generation, which is extensively utilized in spell checker systems. It calculates candidates within a distance of T edit operations of the detected OOV words. Han and Baldwin (2011) stated that when T is less than or equal to two, the level of recall is high enough. The edit distance is the number of applied edits in changing one word to another. An edit could be a deletion, transposition, alteration, or insertion. Studies in spelling correction found that one lexical edit distance covers 80% to 95% of errors, and two lexical edit distances cover 98% of them. Therefore, here we use lexical variations with less than or equal to two edit distances. For a word of length n characters, 54n + 25 combinations will be generated with one lexical edit distance using four reshaping</context>
<context position="18369" citStr="Han &amp; Baldwin, 2011" startWordPosition="2851" endWordPosition="2854"> 11 Table 2: An example of the positional indexes obtained. A probability score is assigned to the normalized candidate according to a comparison between the position of the candidate and positional indexes in the dataset. We look for the candidate in the dataset where there is an occurrence of the candidate with its position index. After aggregating the number of occurrences, we normalize it between 0.0 and 1.0. The next probability calculation method is the dependency-based frequency, which is an augmentation of the previous method. Inspired by a work on the lexical normalization of Tweets (Han &amp; Baldwin, 2011), the noisy portion of our training dataset is parsed to obtain a dependency bank using our adapted version of the Stanford dependency parser (Marneffe, MacCartney, &amp; Manning, 2006). Since our aim is not to perform actual dependency parsing, the dependency types are not extracted. A cleansed corpus is not utilized because the percentage of IV words is high enough in the corpus, and in the probabilitymeasuring phase, OOV words are already detected. For example, from a sentence such as “I will go to London by next week,” (next, go +3) is obtained, indicating that next appears two words after go.</context>
</contexts>
<marker>Han, Baldwin, 2011</marker>
<rawString>Han, B., &amp; Baldwin, T. (2011). Lexical Normalisation of Short Text Messages: Makn Sens a #Twitter. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1 (pp. 368–378). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>B Han</author>
<author>P Cook</author>
<author>T Baldwin</author>
</authors>
<title>Automatically Constructing a Normalisation Dictionary for Microblogs.</title>
<date>2012</date>
<booktitle>In Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning</booktitle>
<pages>421--432</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="16909" citStr="Han et al., 2012" startWordPosition="2599" endWordPosition="2602">ation agreement as an indicator. The experts were instructed to produce a text that is as close to standard English as possible, but leaves the Twitter special symbols (e.g. #topic and @username) as is. The dataset was split into two parts: 5,000 messages for the training phase, and 2,000 messages for the testing phase. 5.1 Calculation of probability scores In order to select the most suitable candidates, we calculate their conditional probability scores using, positional indexing, a dependency-based frequency feature, and a language model (LM). Inspired by work on a normalization dictionary (Han et al., 2012), the first method to calculate the probability score of the candidates is the positional indexing, which is widely used in information retrieval systems. The positional indexing deals with positional locations of term occurrences inside documents. To compile a positional index dataset, a method illustrated in Manning and Raghavan (2009) is applied on a cleansed portion of our Twitter corpus. Table 2 refers to an example of our achieved positional index dataset. Each Twitter message is considered as a single document, and, hence, a unique document ID is assigned to each document. The frequency</context>
</contexts>
<marker>Han, Cook, Baldwin, 2012</marker>
<rawString>Han, B., Cook, P., &amp; Baldwin, T. (2012). Automatically Constructing a Normalisation Dictionary for Microblogs. In Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (pp. 421–432). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>F Jelinek</author>
</authors>
<title>Readings in Speech Recognition.</title>
<date>1990</date>
<pages>450--506</pages>
<publisher>Morgan Kaufmann Publishers Inc.</publisher>
<location>San Francisco, CA, USA:</location>
<contexts>
<context position="23846" citStr="Jelinek, 1990" startWordPosition="3784" endWordPosition="3785">anslation (SMT) is a cutting-edge approach that handles the normalization problem as a statistical machine translation task; it was first introduced by Aw, Zhang, Xiao, and Su (2006). The SMT-like approach translates a source language (UGC) to a target language (standard language). The experiment was performed using Moses (Koehn et al., 2007) for statistical translation, Giza++ (Och &amp; Ney, 2003) for word alignment, and SRILM (Stolcke, 2002) for LM compiling. The SMT system is trained using our Twitter aligned dataset. The optimum results were achieved using a trigram LM and Backoff smoothing (Jelinek, 1990): 78.81 BLEU score. Table 4 indicates some statistics about our testing dataset. The OOV words are those detected by our OOV detection module. The BLEU score of raw text is an important measure to analyze the difficulty of the task. It is important to note that the dataset used in our experiment contains an above average number of OOV words compared to the datasets in other related papers. The dataset used by Kobus et al. (2008) consists of 32% OOV words, which is slightly lower than 34% of our dataset. In addition, Aw et al. (2006) used a dataset with a baseline BLEU score of 57.84, which ind</context>
</contexts>
<marker>Jelinek, 1990</marker>
<rawString>Jelinek, F. (1990). Readings in Speech Recognition. In A. Waibel &amp; K.-F. Lee (Eds.), (pp. 450–506). San Francisco, CA, USA: Morgan Kaufmann Publishers Inc.</rawString>
</citation>
<citation valid="true">
<authors>
<author>M Kaufmann</author>
<author>J Kalita</author>
</authors>
<title>Syntactic normalization of Twitter messages.</title>
<date>2010</date>
<booktitle>International Conference on Natural Language Processing,</booktitle>
<location>Kharagpur, India.</location>
<contexts>
<context position="7501" citStr="Kaufmann and Kalita (2010)" startWordPosition="1125" endWordPosition="1128">ary and future works. 2 Related work The normalization approaches can be categorized into four groups. The first group is called statistical machine translation (SMT) paradigm that addresses the normalization problem as a statistical machine translation task. This paradigm was first introduced by Aw, Zhang, Xiao and Su (2006) to normalize SMS text that translates a source language (UGC) to a target language (standard language). This paradigm has since been re-examined, expanded and improved by other researchers (Lopez Ludeña, San Segundo, Montero, Barra Chicote, &amp; Lorenzo, 2012). For example, Kaufmann and Kalita (2010) used the SMT-like approach to normalize English Tweets. To normalize SMS language, a supervised noisy channel model was introduced by Choudhury, Saraf, Jain, Sarkar, and Basu (2007) that used a hidden Markov model (HMM). This approach mimics the spell checking task that tries to handle the normalization problem via noisy channel models that study the UGC text as a noisy version of standard language. This paradigm has been scrutinized and enhanced by other researchers (Liu et al., 2011; Xue, Yin, &amp; Davison, 2011a). For example, Cook and Stevenson (2009) modified this approach to design an unsu</context>
</contexts>
<marker>Kaufmann, Kalita, 2010</marker>
<rawString>Kaufmann, M., &amp; Kalita, J. (2010). Syntactic normalization of Twitter messages. International Conference on Natural Language Processing, Kharagpur, India.</rawString>
</citation>
<citation valid="true">
<authors>
<author>M D Kernighan</author>
<author>K W Church</author>
<author>W A Gale</author>
</authors>
<title>A Spelling Correction Program Based on a Noisy Channel Model.</title>
<date>1990</date>
<booktitle>In Proceedings of the 13th Conference on Computational Linguistics - Volume</booktitle>
<volume>2</volume>
<pages>205--210</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="20266" citStr="Kernighan, Church, &amp; Gale, 1990" startWordPosition="3167" endWordPosition="3171">ulates the probabilities based on a language model. The cleansed part of our training dataset, which consists of more than 55,000 words, is fed into SRILM (Stolcke, 2002) to compile a bidirectional trigram LM by employing the Kneser-Ney smoothing algorithm. To calculate the probability of each candidate, we used a beam search decoder through the Moses decoder (Koehn et al., 2007). 5.2 Selecting the most probable candidate Previous works on spelling correction and normalization used the source channel model, which is also known as the noisy channel model and Naïve Bayes (Beaufort et al., 2010; Kernighan, Church, &amp; Gale, 1990; Mays, Damerau, &amp; Mercer, 1991; Toutanova &amp; Moore, 2002; Xue, Yin, &amp; Davison, 2011b). In the noisy channel approach, we observe the conversion of standard words to noisy words in a training phase in order to build a model. In the prediction phase, the decoder can select the most probable candidate based on the obtained model. The candidate selection is accomplished based on only two parameters: the LM and error model, which is computed as follows: G = arg max{ P(T |O )} = arg max Where T is a target word, 0 is an observed word, fm (T, O) is a feature function, M is a number of total feature f</context>
</contexts>
<marker>Kernighan, Church, Gale, 1990</marker>
<rawString>Kernighan, M. D., Church, K. W., &amp; Gale, W. A. (1990). A Spelling Correction Program Based on a Noisy Channel Model. In Proceedings of the 13th Conference on Computational Linguistics - Volume 2 (pp. 205–210). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>C Kobus</author>
<author>F Yvon</author>
<author>G Damnati</author>
</authors>
<title>Normalizing SMS: Are Two Metaphors Better Than One?</title>
<date>2008</date>
<booktitle>In Proceedings of the 22Nd International Conference on Computational Linguistics -</booktitle>
<volume>1</volume>
<pages>441--448</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="24278" citStr="Kobus et al. (2008)" startWordPosition="3861" endWordPosition="3864">M (Stolcke, 2002) for LM compiling. The SMT system is trained using our Twitter aligned dataset. The optimum results were achieved using a trigram LM and Backoff smoothing (Jelinek, 1990): 78.81 BLEU score. Table 4 indicates some statistics about our testing dataset. The OOV words are those detected by our OOV detection module. The BLEU score of raw text is an important measure to analyze the difficulty of the task. It is important to note that the dataset used in our experiment contains an above average number of OOV words compared to the datasets in other related papers. The dataset used by Kobus et al. (2008) consists of 32% OOV words, which is slightly lower than 34% of our dataset. In addition, Aw et al. (2006) used a dataset with a baseline BLEU score of 57.84, which indicates that the raw text is much more similar to the manual translated text (reference text) than the ones used in our experiment. Avg. length of words (character) 5 Avg. number of words 11 Total No. of tokens 19,759 OOV words 34.02% BLEU score of raw text 42.01 Table 4: Statistics of testing dataset. As shown in Table 4, the average length of words is five characters, which makes the normalization task more difficult. For examp</context>
</contexts>
<marker>Kobus, Yvon, Damnati, 2008</marker>
<rawString>Kobus, C., Yvon, F., &amp; Damnati, G. (2008). Normalizing SMS: Are Two Metaphors Better Than One? In Proceedings of the 22Nd International Conference on Computational Linguistics - Volume 1 (pp. 441–448). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>P Koehn</author>
<author>H Hoang</author>
<author>A Birch</author>
<author>C Callison-Burch</author>
<author>M Federico</author>
<author>N Bertoldi</author>
</authors>
<title>Moses: Open Source Toolkit for Statistical Machine Translation.</title>
<date>2007</date>
<booktitle>In Proceedings of the 45th Annual Meeting of the ACL on Interactive Poster and Demonstration Sessions</booktitle>
<pages>177--180</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="20017" citStr="Koehn et al., 2007" startWordPosition="3128" endWordPosition="3131">for each candidate within a context window of two words on either side. The obtained relative position of a candidate is compared with the existing confidence score in the dependency bank. The third method of probability measurement calculates the probabilities based on a language model. The cleansed part of our training dataset, which consists of more than 55,000 words, is fed into SRILM (Stolcke, 2002) to compile a bidirectional trigram LM by employing the Kneser-Ney smoothing algorithm. To calculate the probability of each candidate, we used a beam search decoder through the Moses decoder (Koehn et al., 2007). 5.2 Selecting the most probable candidate Previous works on spelling correction and normalization used the source channel model, which is also known as the noisy channel model and Naïve Bayes (Beaufort et al., 2010; Kernighan, Church, &amp; Gale, 1990; Mays, Damerau, &amp; Mercer, 1991; Toutanova &amp; Moore, 2002; Xue, Yin, &amp; Davison, 2011b). In the noisy channel approach, we observe the conversion of standard words to noisy words in a training phase in order to build a model. In the prediction phase, the decoder can select the most probable candidate based on the obtained model. The candidate selectio</context>
<context position="23576" citStr="Koehn et al., 2007" startWordPosition="3740" endWordPosition="3743">the state-of-the-art approaches using our dataset. 6-fold cross validation BLEU score Round 1 80.99 Round 2 81.57 Round 3 84.82 Round 4 83.91 Round 5 83.90 Round 6 83.55 Average 83.12 Table 3: Normalization results for 6-fold cross validation test. The statistical machine translation (SMT) is a cutting-edge approach that handles the normalization problem as a statistical machine translation task; it was first introduced by Aw, Zhang, Xiao, and Su (2006). The SMT-like approach translates a source language (UGC) to a target language (standard language). The experiment was performed using Moses (Koehn et al., 2007) for statistical translation, Giza++ (Och &amp; Ney, 2003) for word alignment, and SRILM (Stolcke, 2002) for LM compiling. The SMT system is trained using our Twitter aligned dataset. The optimum results were achieved using a trigram LM and Backoff smoothing (Jelinek, 1990): 78.81 BLEU score. Table 4 indicates some statistics about our testing dataset. The OOV words are those detected by our OOV detection module. The BLEU score of raw text is an important measure to analyze the difficulty of the task. It is important to note that the dataset used in our experiment contains an above average number </context>
</contexts>
<marker>Koehn, Hoang, Birch, Callison-Burch, Federico, Bertoldi, 2007</marker>
<rawString>Koehn, P., Hoang, H., Birch, A., Callison-Burch, C., Federico, M., Bertoldi, N., ... Herbst, E. (2007). Moses: Open Source Toolkit for Statistical Machine Translation. In Proceedings of the 45th Annual Meeting of the ACL on Interactive Poster and Demonstration Sessions (pp. 177–180). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>H Lin</author>
<author>J Bilmes</author>
<author>D Vergyri</author>
<author>K Kirchhoff</author>
</authors>
<title>OOV detection by joint word/phone lattice alignment. In Automatic Speech Recognition Understanding,</title>
<date>2007</date>
<booktitle>ASRU. IEEE Workshop on</booktitle>
<pages>478--483</pages>
<marker>Lin, Bilmes, Vergyri, Kirchhoff, 2007</marker>
<rawString>Lin, H., Bilmes, J., Vergyri, D., &amp; Kirchhoff, K. (2007). OOV detection by joint word/phone lattice alignment. In Automatic Speech Recognition Understanding, 2007. ASRU. IEEE Workshop on (pp. 478–483). doi:10.1109/ASRU.2007.4430159</rawString>
</citation>
<citation valid="true">
<authors>
<author>F Liu</author>
<author>F Weng</author>
<author>B Wang</author>
<author>Y Liu</author>
</authors>
<title>Insertion, Deletion, or Substitution?: Normalizing Text Messages Without Pre-categorization nor Supervision.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: Short Papers -Volume</booktitle>
<volume>2</volume>
<pages>71--76</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="7991" citStr="Liu et al., 2011" startWordPosition="1204" endWordPosition="1207">ther researchers (Lopez Ludeña, San Segundo, Montero, Barra Chicote, &amp; Lorenzo, 2012). For example, Kaufmann and Kalita (2010) used the SMT-like approach to normalize English Tweets. To normalize SMS language, a supervised noisy channel model was introduced by Choudhury, Saraf, Jain, Sarkar, and Basu (2007) that used a hidden Markov model (HMM). This approach mimics the spell checking task that tries to handle the normalization problem via noisy channel models that study the UGC text as a noisy version of standard language. This paradigm has been scrutinized and enhanced by other researchers (Liu et al., 2011; Xue, Yin, &amp; Davison, 2011a). For example, Cook and Stevenson (2009) modified this approach to design an unsupervised method using probabilistic models for only three common abbreviation types: stylistic variation, prefix clipping, and subsequence abbreviation. In addition, Beaufort, Roekhaut, Cougnon, and Fairon (2010) merged 20 the SMT-like and the spell checking approaches to normalize French SMSs. The third group is the dictionary based normalization approach, which is an easy-to-use and fast solution. This approach requires a dictionary whose entries are OOV and standard form pairs. It h</context>
</contexts>
<marker>Liu, Weng, Wang, Liu, 2011</marker>
<rawString>Liu, F., Weng, F., Wang, B., &amp; Liu, Y. (2011). Insertion, Deletion, or Substitution?: Normalizing Text Messages Without Pre-categorization nor Supervision. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: Short Papers -Volume 2 (pp. 71–76). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>V Lopez Ludeña</author>
<author>R San Segundo</author>
<author>J M Montero</author>
<author>R Barra Chicote</author>
<author>J Lorenzo</author>
</authors>
<title>Architecture for Text Normalization using Statistical Machine Translation techniques.</title>
<date>2012</date>
<booktitle>In IberSPEECH</booktitle>
<pages>112--122</pages>
<publisher>Springer.</publisher>
<location>Madrid, Spain:</location>
<marker>Lopez Ludeña, San Segundo, Montero, Barra Chicote, Lorenzo, 2012</marker>
<rawString>Lopez Ludeña, V., San Segundo, R., Montero, J. M., Barra Chicote, R., &amp; Lorenzo, J. (2012). Architecture for Text Normalization using Statistical Machine Translation techniques. In IberSPEECH 2012 (pp. 112–122). Madrid, Spain: Springer.</rawString>
</citation>
<citation valid="true">
<authors>
<author>C D Manning</author>
<author>P Raghavan</author>
</authors>
<title>An Introduction to Information Retrieval.</title>
<date>2009</date>
<note>Online. doi:10.1109/LPT.2009.2020494</note>
<contexts>
<context position="17248" citStr="Manning and Raghavan (2009)" startWordPosition="2652" endWordPosition="2655">culation of probability scores In order to select the most suitable candidates, we calculate their conditional probability scores using, positional indexing, a dependency-based frequency feature, and a language model (LM). Inspired by work on a normalization dictionary (Han et al., 2012), the first method to calculate the probability score of the candidates is the positional indexing, which is widely used in information retrieval systems. The positional indexing deals with positional locations of term occurrences inside documents. To compile a positional index dataset, a method illustrated in Manning and Raghavan (2009) is applied on a cleansed portion of our Twitter corpus. Table 2 refers to an example of our achieved positional index dataset. Each Twitter message is considered as a single document, and, hence, a unique document ID is assigned to each document. The frequency value indicates the total number of appearances of a word in a document. The position values express the locations of the word in the document. 22 Vocab Document ID. Frequency Position have 1 2 4,9 4 3 5, 11, 18 are 5 1 2 12 2 2, 9 14 2 2, 11 Table 2: An example of the positional indexes obtained. A probability score is assigned to the </context>
</contexts>
<marker>Manning, Raghavan, 2009</marker>
<rawString>Manning, C. D., &amp; Raghavan, P. (2009). An Introduction to Information Retrieval. Online. doi:10.1109/LPT.2009.2020494</rawString>
</citation>
<citation valid="true">
<authors>
<author>M-C de Marneffe</author>
<author>B MacCartney</author>
<author>C D Manning</author>
</authors>
<title>Generating typed dependency parses from phrase structure parses.</title>
<date>2006</date>
<booktitle>In The International Conference on Language Resources and Evaluation (LREC)</booktitle>
<pages>449--454</pages>
<location>Genova, Italy.</location>
<contexts>
<context position="18549" citStr="Marneffe, MacCartney, &amp; Manning, 2006" startWordPosition="2878" endWordPosition="2882">ion of the candidate and positional indexes in the dataset. We look for the candidate in the dataset where there is an occurrence of the candidate with its position index. After aggregating the number of occurrences, we normalize it between 0.0 and 1.0. The next probability calculation method is the dependency-based frequency, which is an augmentation of the previous method. Inspired by a work on the lexical normalization of Tweets (Han &amp; Baldwin, 2011), the noisy portion of our training dataset is parsed to obtain a dependency bank using our adapted version of the Stanford dependency parser (Marneffe, MacCartney, &amp; Manning, 2006). Since our aim is not to perform actual dependency parsing, the dependency types are not extracted. A cleansed corpus is not utilized because the percentage of IV words is high enough in the corpus, and in the probabilitymeasuring phase, OOV words are already detected. For example, from a sentence such as “I will go to London by next week,” (next, go +3) is obtained, indicating that next appears two words after go. The aggregations of all the dependency scores, which are called confidence scores, are stored in the dependency bank. A five-gram dependency bank is prepared without using a root </context>
</contexts>
<marker>Marneffe, MacCartney, Manning, 2006</marker>
<rawString>Marneffe, M.-C. de, MacCartney, B., &amp; Manning, C. D. (2006). Generating typed dependency parses from phrase structure parses. In The International Conference on Language Resources and Evaluation (LREC) (pp. 449–454). Genova, Italy.</rawString>
</citation>
<citation valid="true">
<authors>
<author>E Mays</author>
<author>F J Damerau</author>
<author>R L Mercer</author>
</authors>
<title>Context based spelling correction.</title>
<date>1991</date>
<journal>Information Processing &amp; Management,</journal>
<volume>27</volume>
<issue>5</issue>
<pages>517--522</pages>
<contexts>
<context position="20297" citStr="Mays, Damerau, &amp; Mercer, 1991" startWordPosition="3172" endWordPosition="3176"> a language model. The cleansed part of our training dataset, which consists of more than 55,000 words, is fed into SRILM (Stolcke, 2002) to compile a bidirectional trigram LM by employing the Kneser-Ney smoothing algorithm. To calculate the probability of each candidate, we used a beam search decoder through the Moses decoder (Koehn et al., 2007). 5.2 Selecting the most probable candidate Previous works on spelling correction and normalization used the source channel model, which is also known as the noisy channel model and Naïve Bayes (Beaufort et al., 2010; Kernighan, Church, &amp; Gale, 1990; Mays, Damerau, &amp; Mercer, 1991; Toutanova &amp; Moore, 2002; Xue, Yin, &amp; Davison, 2011b). In the noisy channel approach, we observe the conversion of standard words to noisy words in a training phase in order to build a model. In the prediction phase, the decoder can select the most probable candidate based on the obtained model. The candidate selection is accomplished based on only two parameters: the LM and error model, which is computed as follows: G = arg max{ P(T |O )} = arg max Where T is a target word, 0 is an observed word, fm (T, O) is a feature function, M is a number of total feature functions, and X is a Lagrange m</context>
</contexts>
<marker>Mays, Damerau, Mercer, 1991</marker>
<rawString>Mays, E., Damerau, F. J., &amp; Mercer, R. L. (1991). Context based spelling correction. Information Processing &amp; Management, 27(5), 517–522.</rawString>
</citation>
<citation valid="true">
<authors>
<author>J Novak</author>
<author>D Yang</author>
<author>N Minematsu</author>
<author>K Hirose</author>
</authors>
<title>Phonetisaurus: A wfst-driven phoneticizer.</title>
<date>2011</date>
<institution>The University of Tokyo, Tokyo Institute of Technology.</institution>
<note>Retrieved January 1, 2014, from http://code.google.com/p/phonetisaurus/</note>
<contexts>
<context position="14031" citStr="Novak, Yang, Minematsu, &amp; Hirose, 2011" startWordPosition="2142" endWordPosition="2147">aer — aaer, baer, caer, daer, eaer, faer, gaer, haer, etc.), which generates 26(n + 1) combinations. Finally, from the achieved combinations, standard words will be selected using the Aspell dictionary. However, many OOV words in Twitter are quite far from their target in term of edit distance especially in terms of deletions and substitutions. Therefore, we generated more candidates via three other methods. Similar to the speech recognition systems, the second module generates candidates based on phoneme sounds. First, grapheme to phoneme conversion is performed using the Phonetisaurus tool (Novak, Yang, Minematsu, &amp; Hirose, 2011). Phonetisaurus is an open-source phonetizer that is designed in the form of a weighted finite state transducer (WFST). After selecting the 10 best phoneme sequences, it looks up the phonemes in a pronouncing dictionary – Carnegie Mellon University (CMU) dictionary. The CMU is a machine-readable pronunciation dictionary that contains over 134,000 words including OOV words such as proper nouns and acronyms. Due to the existence of a large number of OOV words in the CMU dictionary, we filter out the OOVs using the Aspell dictionary. The third module, as proposed by Saloot, Idris, and Aw (2014),</context>
</contexts>
<marker>Novak, Yang, Minematsu, Hirose, 2011</marker>
<rawString>Novak, J., Yang, D., Minematsu, N., &amp; Hirose, K. (2011). Phonetisaurus: A wfst-driven phoneticizer. The University of Tokyo, Tokyo Institute of Technology. Retrieved January 1, 2014, from http://code.google.com/p/phonetisaurus/</rawString>
</citation>
<citation valid="true">
<authors>
<author>F J Och</author>
<author>H Ney</author>
</authors>
<title>Discriminative Training and Maximum Entropy Models for Statistical Machine Translation.</title>
<date>2002</date>
<booktitle>In Proceedings of the 40th Annual Meeting on Association for Computational Linguistics</booktitle>
<pages>295--302</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="4870" citStr="Och &amp; Ney, 2002" startWordPosition="729" endWordPosition="732">he normalization bears a resemblance to spelling correction. The ultimate goal of which is the detection and correction of OOV words. The spelling correction methods only focus on misspelled words while normalization systems consider all forms of OOV words, such as representing sounds phonetically (e.g. by the way — btw) and shortened forms (e.g. university — uni). Thus, normalization approaches should address a higher volume of OOV words compared to spelling correction approaches that lead to more complexity. To address this complexity, we use maximum entropy (Berger, Pietra, &amp; Pietra, 1996; Och &amp; Ney, 2002) for utilizing and incorporating more probability functions. Our approach is based on the hypothesis that integrating more probability functions will boost the performance of the method; however, the available information and number of probability functions for (OOV word, standard word) pairs are always limited. Maximum entropy (Maxent) provides a criterion for integrating probability distributions based on partial knowledge. The Maxent produces the lowest biased estimation on the given information, that is, it is maximally neutral regarding missing information. When defining some unknown even</context>
</contexts>
<marker>Och, Ney, 2002</marker>
<rawString>Och, F. J., &amp; Ney, H. (2002). Discriminative Training and Maximum Entropy Models for Statistical Machine Translation. In Proceedings of the 40th Annual Meeting on Association for Computational Linguistics (pp. 295–302). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>F J Och</author>
<author>H Ney</author>
</authors>
<title>A Systematic Comparison of Various Statistical Alignment Models.</title>
<date>2003</date>
<journal>Comput. Linguist.,</journal>
<volume>29</volume>
<issue>1</issue>
<pages>19--51</pages>
<contexts>
<context position="23630" citStr="Och &amp; Ney, 2003" startWordPosition="3748" endWordPosition="3751"> cross validation BLEU score Round 1 80.99 Round 2 81.57 Round 3 84.82 Round 4 83.91 Round 5 83.90 Round 6 83.55 Average 83.12 Table 3: Normalization results for 6-fold cross validation test. The statistical machine translation (SMT) is a cutting-edge approach that handles the normalization problem as a statistical machine translation task; it was first introduced by Aw, Zhang, Xiao, and Su (2006). The SMT-like approach translates a source language (UGC) to a target language (standard language). The experiment was performed using Moses (Koehn et al., 2007) for statistical translation, Giza++ (Och &amp; Ney, 2003) for word alignment, and SRILM (Stolcke, 2002) for LM compiling. The SMT system is trained using our Twitter aligned dataset. The optimum results were achieved using a trigram LM and Backoff smoothing (Jelinek, 1990): 78.81 BLEU score. Table 4 indicates some statistics about our testing dataset. The OOV words are those detected by our OOV detection module. The BLEU score of raw text is an important measure to analyze the difficulty of the task. It is important to note that the dataset used in our experiment contains an above average number of OOV words compared to the datasets in other related</context>
</contexts>
<marker>Och, Ney, 2003</marker>
<rawString>Och, F. J., &amp; Ney, H. (2003). A Systematic Comparison of Various Statistical Alignment Models. Comput. Linguist., 29(1), 19–51.</rawString>
</citation>
<citation valid="true">
<authors>
<author>J Oliva</author>
<author>J I Serrano</author>
<author>M D Del Castillo</author>
<author>Á Igesias</author>
</authors>
<title>A SMS Normalization System Integrating Multiple Grammatical Resources.</title>
<date>2013</date>
<journal>Natural Language Engineering,</journal>
<volume>19</volume>
<issue>01</issue>
<pages>121--141</pages>
<marker>Oliva, Serrano, Del Castillo, Igesias, 2013</marker>
<rawString>Oliva, J., Serrano, J. I., Del Castillo, M. D., &amp; Igesias, Á. (2013). A SMS Normalization System Integrating Multiple Grammatical Resources. Natural Language Engineering, 19(01), 121–141.</rawString>
</citation>
<citation valid="true">
<authors>
<author>K Papineni</author>
<author>S Roukos</author>
<author>T Ward</author>
</authors>
<title>Maximum likelihood and discriminative training of direct translation models.</title>
<date>1998</date>
<booktitle>In Acoustics, Speech and Signal Processing,</booktitle>
<volume>1</volume>
<pages>189--192</pages>
<contexts>
<context position="21753" citStr="Papineni, Roukos, &amp; Ward, 1998" startWordPosition="3425" endWordPosition="3429">g phase before the actual usage. 6 Experimental results and discussion We evaluate our approach in terms of BLEU score (Papineni, Roukos, Ward, &amp; Zhu, 2002), since BLEU has become a well-known and adequate evaluation metric in normalization studies (Contractor, Faruquie, &amp; Subramaniam, 2010; Schlippe, Zhu, Gebhardt, &amp; Schultz, 2010). The achieved baseline for the testing dataset is 42.01 BLEU score, that is, the volume of similarity between the testing text and the reference text (manually normalized text) in term of BLEU score. In the training phase, we performed maximum likelihood training (Papineni, Roukos, &amp; Ward, 1998; Streit &amp; Luginbuhl, 1994) for X1, X2 and X3 between 0.0 and 1.0. Figure 1 shows the tolerance of the performance while transition of X1 and X2 (when X3 is fixed to 1.0). Figure 1 depicts that the value of performance achieves the high(T, O) Y M ∑ Am •fm m =1 23 est when the X1 and X2 are close to 0.63 and 0.9, respectively. It is found that the best performance is achieved by 0.6, 0.9, and 1.0 values for X1, X2, and X3, respectively. This means that LM has the highest impact on the candidate selection, and that dependency-based frequency has a higher impact on candidate selection than pos</context>
</contexts>
<marker>Papineni, Roukos, Ward, 1998</marker>
<rawString>Papineni, K., Roukos, S., &amp; Ward, T. (1998). Maximum likelihood and discriminative training of direct translation models. In Acoustics, Speech and Signal Processing, 1998. Proceedings of the 1998 IEEE International Conference on (Vol. 1, pp. 189–192 vol.1).</rawString>
</citation>
<citation valid="true">
<authors>
<author>K Papineni</author>
<author>S Roukos</author>
<author>T Ward</author>
<author>W-J Zhu</author>
</authors>
<title>BLEU: A Method for Automatic Evaluation of Machine Translation.</title>
<date>2002</date>
<booktitle>In Proceedings of the 40th Annual Meeting on Association for Computational Linguistics</booktitle>
<pages>311--318</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="21278" citStr="Papineni, Roukos, Ward, &amp; Zhu, 2002" startWordPosition="3352" endWordPosition="3357"> only two parameters: the LM and error model, which is computed as follows: G = arg max{ P(T |O )} = arg max Where T is a target word, 0 is an observed word, fm (T, O) is a feature function, M is a number of total feature functions, and X is a Lagrange multiplier of each function. In our case, M equals three, in which f1 is the positional indexing, f2 is the dependency-based frequency feature, and f3 is the LM probability. The Maxent requires X being determined in the training phase before the actual usage. 6 Experimental results and discussion We evaluate our approach in terms of BLEU score (Papineni, Roukos, Ward, &amp; Zhu, 2002), since BLEU has become a well-known and adequate evaluation metric in normalization studies (Contractor, Faruquie, &amp; Subramaniam, 2010; Schlippe, Zhu, Gebhardt, &amp; Schultz, 2010). The achieved baseline for the testing dataset is 42.01 BLEU score, that is, the volume of similarity between the testing text and the reference text (manually normalized text) in term of BLEU score. In the training phase, we performed maximum likelihood training (Papineni, Roukos, &amp; Ward, 1998; Streit &amp; Luginbuhl, 1994) for X1, X2 and X3 between 0.0 and 1.0. Figure 1 shows the tolerance of the performance while tran</context>
</contexts>
<marker>Papineni, Roukos, Ward, Zhu, 2002</marker>
<rawString>Papineni, K., Roukos, S., Ward, T., &amp; Zhu, W.-J. (2002). BLEU: A Method for Automatic Evaluation of Machine Translation. In Proceedings of the 40th Annual Meeting on Association for Computational Linguistics (pp. 311–318). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>D L Pennell</author>
<author>Y Liu</author>
</authors>
<title>Normalization of text messages for text-to-speech.</title>
<date>2010</date>
<booktitle>Acoustics Speech and Signal Processing (ICASSP), 2010 IEEE International Conference on.</booktitle>
<contexts>
<context position="5655" citStr="Pennell and Liu (2010)" startWordPosition="845" endWordPosition="848">formance of the method; however, the available information and number of probability functions for (OOV word, standard word) pairs are always limited. Maximum entropy (Maxent) provides a criterion for integrating probability distributions based on partial knowledge. The Maxent produces the lowest biased estimation on the given information, that is, it is maximally neutral regarding missing information. When defining some unknown events with a statistical model, we should always select the one that has maximum entropy. Although the Maxent has already been used in the normalization sphere (e.g. Pennell and Liu (2010) utilized Maxent to classify deletion-based abbreviations), this paper explains how to employ Maxent for selecting the best-normalized candidate. We have developed a method that does not require annotated training data and it normalizes unseen data. Most of the normalization approaches substantially depend on the manually annotated data, while the labeled data is costly and time consuming to prepare. We generate normalized candidates for each detected OOV based on lexical, phonemic, and morphophonemic variations. In addition, since our target dataset encompasses Twitter messages from Singapore</context>
</contexts>
<marker>Pennell, Liu, 2010</marker>
<rawString>Pennell, D. L., &amp; Liu, Y. (2010). Normalization of text messages for text-to-speech. Acoustics Speech and Signal Processing (ICASSP), 2010 IEEE International Conference on.</rawString>
</citation>
<citation valid="true">
<authors>
<author>M A Saloot</author>
<author>N Idris</author>
<author>A Aw</author>
</authors>
<title>Noisy Text Normalization Using an Enhanced Language Model.</title>
<date>2014</date>
<booktitle>In Proceedings of the International Conference on Artificial Intelligence and Pattern Recognition</booktitle>
<pages>111--122</pages>
<publisher>SDIWC.</publisher>
<location>Kuala Lumpur, Malaysia:</location>
<marker>Saloot, Idris, Aw, 2014</marker>
<rawString>Saloot, M. A., Idris, N., &amp; Aw, A. (2014). Noisy Text Normalization Using an Enhanced Language Model. In Proceedings of the International Conference on Artificial Intelligence and Pattern Recognition (pp. 111–122). Kuala Lumpur, Malaysia: SDIWC.</rawString>
</citation>
<citation valid="true">
<authors>
<author>M A Saloot</author>
<author>N Idris</author>
<author>A Aw</author>
<author>D Thorleuchter</author>
</authors>
<title>Twitter corpus creation: The case of a Malay Chat-style-text Corpus (MCC).</title>
<date>2014</date>
<journal>Digital Scholarship in the Humanities.</journal>
<note>Retrieved from http://dsh.oxfordjournals.org/content/early/2014/12 /13/llc.fqu066.abstract</note>
<contexts>
<context position="16204" citStr="Saloot, Idris, Aw, &amp; Thorleuchter, 2014" startWordPosition="2488" endWordPosition="2493">l edit 70 distance Combination 50 Phoneme 20 Malay dictionary 3 Table 1: The average number of generated candidates for five letter words. 5 Candidate selection The main contribution of this work is to present a novel candidate selection method. The candidate selection stage consists of two steps: 1) assigning a variety of probability scores to candidates, and 2) integrating probability scores to select the best candidate. Our candidate selection method requires a training dataset. The training and testing datasets are collected from an extensive English Twitter corpus posted by Singaporeans (Saloot, Idris, Aw, &amp; Thorleuchter, 2014). Three linguistic experts manually normalized 7,000 Tweets, while using inter-normalization agreement as an indicator. The experts were instructed to produce a text that is as close to standard English as possible, but leaves the Twitter special symbols (e.g. #topic and @username) as is. The dataset was split into two parts: 5,000 messages for the training phase, and 2,000 messages for the testing phase. 5.1 Calculation of probability scores In order to select the most suitable candidates, we calculate their conditional probability scores using, positional indexing, a dependency-based freque</context>
</contexts>
<marker>Saloot, Idris, Aw, Thorleuchter, 2014</marker>
<rawString>Saloot, M. A., Idris, N., Aw, A., &amp; Thorleuchter, D. (2014). Twitter corpus creation: The case of a Malay Chat-style-text Corpus (MCC). Digital Scholarship in the Humanities. Retrieved from http://dsh.oxfordjournals.org/content/early/2014/12 /13/llc.fqu066.abstract</rawString>
</citation>
<citation valid="true">
<authors>
<author>M A Saloot</author>
<author>N Idris</author>
<author>R Mahmud</author>
</authors>
<title>An architecture for Malay Tweet normalization.</title>
<date>2014</date>
<journal>Information Processing &amp; Management,</journal>
<volume>50</volume>
<issue>5</issue>
<pages>621--633</pages>
<contexts>
<context position="8752" citStr="Saloot, Idris, &amp; Mahmud, 2014" startWordPosition="1316" endWordPosition="1320">probabilistic models for only three common abbreviation types: stylistic variation, prefix clipping, and subsequence abbreviation. In addition, Beaufort, Roekhaut, Cougnon, and Fairon (2010) merged 20 the SMT-like and the spell checking approaches to normalize French SMSs. The third group is the dictionary based normalization approach, which is an easy-to-use and fast solution. This approach requires a dictionary whose entries are OOV and standard form pairs. It has been proven that using a colloquial dictionary can outperform some state-of-the-art and complex approaches (Clark &amp; Araki, 2011; Saloot, Idris, &amp; Mahmud, 2014). However, its performance highly relies on the size of the dictionary. Therefore, Han, Cook, and Baldwin (2012) introduced a method to automatically compile a large dictionary. To address the shortcomings of the dictionary approach, Oliva, Serrano, Del Castillo, and Igesias (2013) introduced a special Spanish phonetic dictionary, in which each entry is formed by a coded consonant string, vowels strings, and their positions in the word, for normalizing Spanish SMS texts. The fourth group resembles automatic speech recognition (ASR) systems. This paradigm consists of three steps: 1) converting</context>
</contexts>
<marker>Saloot, Idris, Mahmud, 2014</marker>
<rawString>Saloot, M. A., Idris, N., &amp; Mahmud, R. (2014). An architecture for Malay Tweet normalization. Information Processing &amp; Management, 50(5), 621– 633.</rawString>
</citation>
<citation valid="true">
<authors>
<author>T Schlippe</author>
<author>C Zhu</author>
<author>J Gebhardt</author>
<author>T Schultz</author>
</authors>
<title>Text normalization based on statistical machine translation and internet user support.</title>
<date>2010</date>
<booktitle>INTERSPEECH</booktitle>
<pages>1816--1819</pages>
<contexts>
<context position="21456" citStr="Schlippe, Zhu, Gebhardt, &amp; Schultz, 2010" startWordPosition="3377" endWordPosition="3382">feature function, M is a number of total feature functions, and X is a Lagrange multiplier of each function. In our case, M equals three, in which f1 is the positional indexing, f2 is the dependency-based frequency feature, and f3 is the LM probability. The Maxent requires X being determined in the training phase before the actual usage. 6 Experimental results and discussion We evaluate our approach in terms of BLEU score (Papineni, Roukos, Ward, &amp; Zhu, 2002), since BLEU has become a well-known and adequate evaluation metric in normalization studies (Contractor, Faruquie, &amp; Subramaniam, 2010; Schlippe, Zhu, Gebhardt, &amp; Schultz, 2010). The achieved baseline for the testing dataset is 42.01 BLEU score, that is, the volume of similarity between the testing text and the reference text (manually normalized text) in term of BLEU score. In the training phase, we performed maximum likelihood training (Papineni, Roukos, &amp; Ward, 1998; Streit &amp; Luginbuhl, 1994) for X1, X2 and X3 between 0.0 and 1.0. Figure 1 shows the tolerance of the performance while transition of X1 and X2 (when X3 is fixed to 1.0). Figure 1 depicts that the value of performance achieves the high(T, O) Y M ∑ Am •fm m =1 23 est when the X1 and X2 are close to </context>
</contexts>
<marker>Schlippe, Zhu, Gebhardt, Schultz, 2010</marker>
<rawString>Schlippe, T., Zhu, C., Gebhardt, J., &amp; Schultz, T. (2010). Text normalization based on statistical machine translation and internet user support. In T. Kobayashi, K. Hirose, &amp; S. Nakamura (Eds.), INTERSPEECH (pp. 1816–1819). ISCA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>J Smith</author>
<author>P Padi</author>
</authors>
<title>Lets make a dictionary.</title>
<date>2006</date>
<booktitle>In Proceedings of the the Eighth Biennial Conference of the Borneo Research Council (BRC)</booktitle>
<pages>515--520</pages>
<location>Sarawak, Malaysia:</location>
<publisher>Borneo Research Council (BRC).</publisher>
<contexts>
<context position="15116" citStr="Smith &amp; Padi, 2006" startWordPosition="2321" endWordPosition="2324">the CMU dictionary, we filter out the OOVs using the Aspell dictionary. The third module, as proposed by Saloot, Idris, and Aw (2014), is a combination of the two previous modules. First, it lexically generates candidates within one edit distance of the given OOV word, and then sends the candidates to the phoneme module. Since our testing dataset consists of English Tweets posted by Singaporeans, code-switching between Malay and English is frequent in the text. Therefore, our last module translates OOV words to English (if any). We searched for the tokens in the Smith MalayEnglish Dictionary (Smith &amp; Padi, 2006), and inserted the meanings in the candidate set. Table 1 displays the average number of generated candidates for each module. The lowest rate is associated with the Malay dictionary module. Two lexical edit operations generate the highest number of candidates, which indicates the highest recall and lowest precision. The rank of combination and phoneme modules are second and third, respectively. Io. module Average number of candidates Two lexical edit 70 distance Combination 50 Phoneme 20 Malay dictionary 3 Table 1: The average number of generated candidates for five letter words. 5 Candidate </context>
</contexts>
<marker>Smith, Padi, 2006</marker>
<rawString>Smith, J., &amp; Padi, P. (2006). Lets make a dictionary. In Proceedings of the the Eighth Biennial Conference of the Borneo Research Council (BRC) (pp. 515–520). Sarawak, Malaysia: Borneo Research Council (BRC).</rawString>
</citation>
<citation valid="true">
<authors>
<author>R Sproat</author>
<author>A W Black</author>
<author>S Chen</author>
<author>S Kumar</author>
<author>M Ostendorf</author>
<author>C Richards</author>
</authors>
<title>Normalization of non-standard words.</title>
<date>2001</date>
<journal>Computer Speech &amp; Language,</journal>
<volume>15</volume>
<issue>3</issue>
<pages>287--333</pages>
<contexts>
<context position="3534" citStr="Sproat et al., 2001" startWordPosition="518" endWordPosition="521">he small keypads, and 3) using UGC in informal communications between friends and relatives. Whatever their causes, these alterations considerably affect any standard natural language processing (NLP) system, due to the presence of many out of vocabulary (OOV) words, also known as non-standard words (NSWs) and unknown words. Therefore, a text normalization process must be performed before any conven19 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 19–27, Beijing, China, July 31, 2015. c © 2015 Association for Computational Linguistics tional NLP process is implemented (Sproat et al., 2001). As defined by Liu, Weng, Wang, and Liu (2011), “Text message normalization aims to replace the non-standard tokens that carry significant meanings with the context-appropriate standard words.” This paper proposes a novel normalization approach for Twitter messages. Twitter is the most popular microblogging service in the world for news-casting, sharing thoughts, and staying in touch with friends. Since its initial founding in 2006, it has gathered hundreds of millions of registered users. Tweets refer to messages sent on Twitter, which is restricted to 140 characters, 20 characters less than</context>
</contexts>
<marker>Sproat, Black, Chen, Kumar, Ostendorf, Richards, 2001</marker>
<rawString>Sproat, R., Black, A. W., Chen, S., Kumar, S., Ostendorf, M., &amp; Richards, C. (2001). Normalization of non-standard words. Computer Speech &amp; Language, 15(3), 287–333.</rawString>
</citation>
<citation valid="true">
<authors>
<author>A Stolcke</author>
</authors>
<title>SRILM-an extensible language modeling toolkit.</title>
<date>2002</date>
<booktitle>In Proceedings International Conference on Spoken Language Processing</booktitle>
<pages>257--286</pages>
<contexts>
<context position="19805" citStr="Stolcke, 2002" startWordPosition="3094" endWordPosition="3095">iterated for all words in the sentence. A probability score between 0.0 and 1.0 is assigned to each candidate. A relative position score in the form of (candidate word, context word, position) is calculated for each candidate within a context window of two words on either side. The obtained relative position of a candidate is compared with the existing confidence score in the dependency bank. The third method of probability measurement calculates the probabilities based on a language model. The cleansed part of our training dataset, which consists of more than 55,000 words, is fed into SRILM (Stolcke, 2002) to compile a bidirectional trigram LM by employing the Kneser-Ney smoothing algorithm. To calculate the probability of each candidate, we used a beam search decoder through the Moses decoder (Koehn et al., 2007). 5.2 Selecting the most probable candidate Previous works on spelling correction and normalization used the source channel model, which is also known as the noisy channel model and Naïve Bayes (Beaufort et al., 2010; Kernighan, Church, &amp; Gale, 1990; Mays, Damerau, &amp; Mercer, 1991; Toutanova &amp; Moore, 2002; Xue, Yin, &amp; Davison, 2011b). In the noisy channel approach, we observe the conver</context>
<context position="23676" citStr="Stolcke, 2002" startWordPosition="3757" endWordPosition="3758"> 2 81.57 Round 3 84.82 Round 4 83.91 Round 5 83.90 Round 6 83.55 Average 83.12 Table 3: Normalization results for 6-fold cross validation test. The statistical machine translation (SMT) is a cutting-edge approach that handles the normalization problem as a statistical machine translation task; it was first introduced by Aw, Zhang, Xiao, and Su (2006). The SMT-like approach translates a source language (UGC) to a target language (standard language). The experiment was performed using Moses (Koehn et al., 2007) for statistical translation, Giza++ (Och &amp; Ney, 2003) for word alignment, and SRILM (Stolcke, 2002) for LM compiling. The SMT system is trained using our Twitter aligned dataset. The optimum results were achieved using a trigram LM and Backoff smoothing (Jelinek, 1990): 78.81 BLEU score. Table 4 indicates some statistics about our testing dataset. The OOV words are those detected by our OOV detection module. The BLEU score of raw text is an important measure to analyze the difficulty of the task. It is important to note that the dataset used in our experiment contains an above average number of OOV words compared to the datasets in other related papers. The dataset used by Kobus et al. (200</context>
</contexts>
<marker>Stolcke, 2002</marker>
<rawString>Stolcke, A. (2002). SRILM-an extensible language modeling toolkit. In Proceedings International Conference on Spoken Language Processing (pp. 257–286).</rawString>
</citation>
<citation valid="true">
<authors>
<author>R L Streit</author>
<author>T E Luginbuhl</author>
</authors>
<title>Maximum likelihood training of probabilistic neural networks.</title>
<date>1994</date>
<journal>Neural Networks, IEEE Transactions on,</journal>
<volume>5</volume>
<issue>5</issue>
<pages>764--783</pages>
<contexts>
<context position="21780" citStr="Streit &amp; Luginbuhl, 1994" startWordPosition="3430" endWordPosition="3433"> 6 Experimental results and discussion We evaluate our approach in terms of BLEU score (Papineni, Roukos, Ward, &amp; Zhu, 2002), since BLEU has become a well-known and adequate evaluation metric in normalization studies (Contractor, Faruquie, &amp; Subramaniam, 2010; Schlippe, Zhu, Gebhardt, &amp; Schultz, 2010). The achieved baseline for the testing dataset is 42.01 BLEU score, that is, the volume of similarity between the testing text and the reference text (manually normalized text) in term of BLEU score. In the training phase, we performed maximum likelihood training (Papineni, Roukos, &amp; Ward, 1998; Streit &amp; Luginbuhl, 1994) for X1, X2 and X3 between 0.0 and 1.0. Figure 1 shows the tolerance of the performance while transition of X1 and X2 (when X3 is fixed to 1.0). Figure 1 depicts that the value of performance achieves the high(T, O) Y M ∑ Am •fm m =1 23 est when the X1 and X2 are close to 0.63 and 0.9, respectively. It is found that the best performance is achieved by 0.6, 0.9, and 1.0 values for X1, X2, and X3, respectively. This means that LM has the highest impact on the candidate selection, and that dependency-based frequency has a higher impact on candidate selection than positional. 0.9 0.8 0.7 0.6 0.</context>
</contexts>
<marker>Streit, Luginbuhl, 1994</marker>
<rawString>Streit, R. L., &amp; Luginbuhl, T. E. (1994). Maximum likelihood training of probabilistic neural networks. Neural Networks, IEEE Transactions on, 5(5), 764–783. doi:10.1109/72.317728</rawString>
</citation>
<citation valid="true">
<authors>
<author>C Thurlow</author>
<author>A Brown</author>
</authors>
<title>Generation Txt? The sociolinguistics of young people’s textmessaging.</title>
<date>2003</date>
<contexts>
<context position="2470" citStr="Thurlow &amp; Brown, 2003" startWordPosition="348" endWordPosition="351"> Research (I2R), A*STAR, Singapore aaiti@i2r.a-star.edu.sg ous well-known normalization approaches. 1 Introduction The advent of Web 2.0 and electronic communications has enabled the extensive creation and dissemination of user-generated content (UGC). The UGC collections provide invaluable data sources in order to mine and extract beneficial information and knowledge, while, at the same time, resulting in less standardized language (Clark &amp; Araki, 2011; Daugherty, Eastin, &amp; Bright, 2008). However, such content diverges from standard writing conventions. As shown by experts (Bieswanger, 2007; Thurlow &amp; Brown, 2003), this divergence is due to the usage of a variety of coding strategies, including digit phonemes (you too — you2), phonetic transcriptions (you — u), vowel drops (dinner — dnnr), misspellings (convenience — convineince), and missing or incorrect punctuation marks (If I were you, I&apos;d probably go. — If I were you Id probably go). These alterations are due to three main parameters: 1) The small allowance of characters, 2) the constraints of the small keypads, and 3) using UGC in informal communications between friends and relatives. Whatever their causes, these alterations considerably affect an</context>
</contexts>
<marker>Thurlow, Brown, 2003</marker>
<rawString>Thurlow, C., &amp; Brown, A. (2003). Generation Txt? The sociolinguistics of young people’s textmessaging.</rawString>
</citation>
<citation valid="true">
<authors>
<author>K Toutanova</author>
<author>R C Moore</author>
</authors>
<title>Pronunciation Modeling for Improved Spelling Correction.</title>
<date>2002</date>
<booktitle>In Proceedings of the 40th Annual Meeting on Association for Computational Linguistics</booktitle>
<pages>144--151</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA:</location>
<contexts>
<context position="20322" citStr="Toutanova &amp; Moore, 2002" startWordPosition="3177" endWordPosition="3180"> part of our training dataset, which consists of more than 55,000 words, is fed into SRILM (Stolcke, 2002) to compile a bidirectional trigram LM by employing the Kneser-Ney smoothing algorithm. To calculate the probability of each candidate, we used a beam search decoder through the Moses decoder (Koehn et al., 2007). 5.2 Selecting the most probable candidate Previous works on spelling correction and normalization used the source channel model, which is also known as the noisy channel model and Naïve Bayes (Beaufort et al., 2010; Kernighan, Church, &amp; Gale, 1990; Mays, Damerau, &amp; Mercer, 1991; Toutanova &amp; Moore, 2002; Xue, Yin, &amp; Davison, 2011b). In the noisy channel approach, we observe the conversion of standard words to noisy words in a training phase in order to build a model. In the prediction phase, the decoder can select the most probable candidate based on the obtained model. The candidate selection is accomplished based on only two parameters: the LM and error model, which is computed as follows: G = arg max{ P(T |O )} = arg max Where T is a target word, 0 is an observed word, fm (T, O) is a feature function, M is a number of total feature functions, and X is a Lagrange multiplier of each functio</context>
</contexts>
<marker>Toutanova, Moore, 2002</marker>
<rawString>Toutanova, K., &amp; Moore, R. C. (2002). Pronunciation Modeling for Improved Spelling Correction. In Proceedings of the 40th Annual Meeting on Association for Computational Linguistics (pp. 144– 151). Stroudsburg, PA, USA: Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Z Xue</author>
<author>D Yin</author>
<author>B D Davison</author>
</authors>
<title>Normalizing Microtext.</title>
<date>2011</date>
<booktitle>In Analyzing Microtext (Vol. WS11–05).</booktitle>
<publisher>AAAI.</publisher>
<contexts>
<context position="8018" citStr="Xue, Yin, &amp; Davison, 2011" startWordPosition="1208" endWordPosition="1212">Lopez Ludeña, San Segundo, Montero, Barra Chicote, &amp; Lorenzo, 2012). For example, Kaufmann and Kalita (2010) used the SMT-like approach to normalize English Tweets. To normalize SMS language, a supervised noisy channel model was introduced by Choudhury, Saraf, Jain, Sarkar, and Basu (2007) that used a hidden Markov model (HMM). This approach mimics the spell checking task that tries to handle the normalization problem via noisy channel models that study the UGC text as a noisy version of standard language. This paradigm has been scrutinized and enhanced by other researchers (Liu et al., 2011; Xue, Yin, &amp; Davison, 2011a). For example, Cook and Stevenson (2009) modified this approach to design an unsupervised method using probabilistic models for only three common abbreviation types: stylistic variation, prefix clipping, and subsequence abbreviation. In addition, Beaufort, Roekhaut, Cougnon, and Fairon (2010) merged 20 the SMT-like and the spell checking approaches to normalize French SMSs. The third group is the dictionary based normalization approach, which is an easy-to-use and fast solution. This approach requires a dictionary whose entries are OOV and standard form pairs. It has been proven that using a</context>
<context position="20349" citStr="Xue, Yin, &amp; Davison, 2011" startWordPosition="3181" endWordPosition="3185">aset, which consists of more than 55,000 words, is fed into SRILM (Stolcke, 2002) to compile a bidirectional trigram LM by employing the Kneser-Ney smoothing algorithm. To calculate the probability of each candidate, we used a beam search decoder through the Moses decoder (Koehn et al., 2007). 5.2 Selecting the most probable candidate Previous works on spelling correction and normalization used the source channel model, which is also known as the noisy channel model and Naïve Bayes (Beaufort et al., 2010; Kernighan, Church, &amp; Gale, 1990; Mays, Damerau, &amp; Mercer, 1991; Toutanova &amp; Moore, 2002; Xue, Yin, &amp; Davison, 2011b). In the noisy channel approach, we observe the conversion of standard words to noisy words in a training phase in order to build a model. In the prediction phase, the decoder can select the most probable candidate based on the obtained model. The candidate selection is accomplished based on only two parameters: the LM and error model, which is computed as follows: G = arg max{ P(T |O )} = arg max Where T is a target word, 0 is an observed word, fm (T, O) is a feature function, M is a number of total feature functions, and X is a Lagrange multiplier of each function. In our case, M equals th</context>
</contexts>
<marker>Xue, Yin, Davison, 2011</marker>
<rawString>Xue, Z., Yin, D., &amp; Davison, B. D. (2011a). Normalizing Microtext. In Analyzing Microtext (Vol. WS11–05). AAAI.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Z Xue</author>
<author>D Yin</author>
<author>B D Davison</author>
</authors>
<title>Normalizing Microtext.</title>
<date>2011</date>
<booktitle>In Analyzing Microtext: Papers from the 2011 AAAI Workshop</booktitle>
<pages>74--79</pages>
<publisher>AAAI.</publisher>
<location>San Francisco, CA, USA:</location>
<contexts>
<context position="8018" citStr="Xue, Yin, &amp; Davison, 2011" startWordPosition="1208" endWordPosition="1212">Lopez Ludeña, San Segundo, Montero, Barra Chicote, &amp; Lorenzo, 2012). For example, Kaufmann and Kalita (2010) used the SMT-like approach to normalize English Tweets. To normalize SMS language, a supervised noisy channel model was introduced by Choudhury, Saraf, Jain, Sarkar, and Basu (2007) that used a hidden Markov model (HMM). This approach mimics the spell checking task that tries to handle the normalization problem via noisy channel models that study the UGC text as a noisy version of standard language. This paradigm has been scrutinized and enhanced by other researchers (Liu et al., 2011; Xue, Yin, &amp; Davison, 2011a). For example, Cook and Stevenson (2009) modified this approach to design an unsupervised method using probabilistic models for only three common abbreviation types: stylistic variation, prefix clipping, and subsequence abbreviation. In addition, Beaufort, Roekhaut, Cougnon, and Fairon (2010) merged 20 the SMT-like and the spell checking approaches to normalize French SMSs. The third group is the dictionary based normalization approach, which is an easy-to-use and fast solution. This approach requires a dictionary whose entries are OOV and standard form pairs. It has been proven that using a</context>
<context position="20349" citStr="Xue, Yin, &amp; Davison, 2011" startWordPosition="3181" endWordPosition="3185">aset, which consists of more than 55,000 words, is fed into SRILM (Stolcke, 2002) to compile a bidirectional trigram LM by employing the Kneser-Ney smoothing algorithm. To calculate the probability of each candidate, we used a beam search decoder through the Moses decoder (Koehn et al., 2007). 5.2 Selecting the most probable candidate Previous works on spelling correction and normalization used the source channel model, which is also known as the noisy channel model and Naïve Bayes (Beaufort et al., 2010; Kernighan, Church, &amp; Gale, 1990; Mays, Damerau, &amp; Mercer, 1991; Toutanova &amp; Moore, 2002; Xue, Yin, &amp; Davison, 2011b). In the noisy channel approach, we observe the conversion of standard words to noisy words in a training phase in order to build a model. In the prediction phase, the decoder can select the most probable candidate based on the obtained model. The candidate selection is accomplished based on only two parameters: the LM and error model, which is computed as follows: G = arg max{ P(T |O )} = arg max Where T is a target word, 0 is an observed word, fm (T, O) is a feature function, M is a number of total feature functions, and X is a Lagrange multiplier of each function. In our case, M equals th</context>
</contexts>
<marker>Xue, Yin, Davison, 2011</marker>
<rawString>Xue, Z., Yin, D., &amp; Davison, B. D. (2011b). Normalizing Microtext. In Analyzing Microtext: Papers from the 2011 AAAI Workshop (pp. 74–79). San Francisco, CA, USA: AAAI.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>