<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.000888">
<title confidence="0.992225">
IITP: Hybrid Approach for Text Normalization in Twitter
</title>
<author confidence="0.99381">
Md Shad Akhtar, Utpal Kumar Sikdar and Asif Ekbal
</author>
<affiliation confidence="0.814676333333333">
Dept of Computer Science and Engineering
IIT Patna
Patna, India
</affiliation>
<email confidence="0.995189">
(shad.pcs15,utpal.sikdar,asif)@iitp.ac.in
</email>
<sectionHeader confidence="0.993848" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.9992404375">
In this paper we report our work for nor-
malization of noisy text in Twitter data.
The method we propose is hybrid in na-
ture that combines machine learning with
rules. In the first step, supervised ap-
proach based on conditional random field
is developed, and in the second step a set
of heuristics rules is applied to the can-
didate wordforms for the normalization.
The classifier is trained with a set of fea-
tures which are derived without the
use of any domain-specific feature and/or
resource. The overall system yields the
precision, recall and F-measure values of
90.26%, 71.91% and 80.05% respectively
for the test dataset.
</bodyText>
<sectionHeader confidence="0.998993" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999890722222222">
Twitter has seen a phenomenal growth in the num-
ber of users during the last few years. Over 500
million user accounts have been registered with it
with approx 302 million active users 1. Amount
of user generated contents over the web would
be unarguably enormous i.e. almost 500 million
tweets per day 2. The fact that Twitter data (or
tweets) are typically noisy and unstructured in na-
ture is due to several grammatical &amp; spelling
mistakes they contain. The size limitation (up to
140 characters only) is another prominent
reason. It confines a user to devise different short
forms (e.g. ‘c u ltr.’ for ‘see you later.’) of a valid
word. Interpreting such forms may be an easy
task for a human being, but it is very difficult to build
an accurate system for solving any problem related
to natural language processing (NLP). At times,
user puts extra emphasis by stretching/elongating
</bodyText>
<footnote confidence="0.827511666666667">
1http://en.wikipedia.org/wiki/Twitter
2http://www.cnet.com/news/report-twitter-hits-half-a-
billion-tweets-a-day/
</footnote>
<bodyText confidence="0.999871390243903">
a valid word to express their feelings. For exam-
ple, they often use word like ‘yeeessss’ to show
their happiness, which is a stretched form of ‘yes’.
Normalization of noisy text is an important and
necessary pre-processing task for building differ-
ent applications related to text processing. It
is pretty obvious from various studies (Liu et
al., 2011; Foster et al., 2011) that presence of
noisy texts makes any natural language process-
ing (NLP) task very tedious to achieve good ac-
curacy levels. The goal of normalization is two-
fold, i.e. a) identification of candidates for nor-
malization and b) converting the candidate word-
forms to the normalized form. Unlike the gen-
eral well-formatted corpus, like newswire, it does
not always contain noisy text. Its main sources
are normally those platforms on which users have
complete freedom to express themselves. There-
fore, user generated tweets are one of the major
sources of noisy texts. In the last couple of years
researchers worldwide have been actively working
for the normalization of noisy contents of twitter
(Han and Baldwin, 2011; Liu et al., 2012; Wang
and Ng, 2013; Porta and Sancho, 2013; Chru-
pala, 2014). In (Han and Baldwin, 2011), a lin-
ear Support Vector Machine (SVM) classifier was
trained for detecting ill-formed words, and then
performed normalization based on morphophone-
mic similarity. Application of edit operations and
recurrent neural embedding can be found in (Chru-
pala, 2014) for text normalization. Their method
learns sequence of edit operations using condi-
tional random field (CRF). In another work, (Liu
et al., 2012) investigated the human perspectives
of enhanced letter transformation, visual priming
and the phonetic similarity for the text normaliza-
tion. The use of beam search decoder and finite-
state transducers can be seen in (Wang and Ng,
2013; Porta and Sancho, 2013) for the word nor-
malization. These existing works are based on dif-
ferent setups and datasets.
</bodyText>
<page confidence="0.977216">
106
</page>
<note confidence="0.789129">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 106–110,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.999974586206896">
For further advancement of research on text nor-
malization and to provide a common benchmark
setup for evaluation, a shared task “ACL2015 W-
NUT: Normalization of Noisy Text in Twitter”3
was organized. The shared task had two vari-
ants: constrained mode and unconstrained mode.
We participated only for the constrained mode
which did not permit us to use any external re-
sources and/or tools except few that were rec-
ommended by the organizers. In this paper we
report our work for normalization. We imple-
mented a hybrid system where machine learn-
ing along with rules are utilized to perform the
task. We have exploited lexical and syntactic
properties of a tweet as discussed in section 3.1
to derive a feature set for identification of noisy
text in the first step. We train Conditional Ran-
dom Field (CRF) (Lafferty et al., 2001) as a ma-
chine learning algorithm to identify the candi-
date wordforms that need to be normalized. In
second step, we apply some rule based methods
(as defined in section 3.2) in order to normalize
the wordforms which were identified in first step.
The organization of the paper is as follows. A
brief theoretical discussion on CRF is presented
in section 2. Section 3 discusses the feature
set and methodology used in the proposed work.
Experimental result and analysis can be found in
section 4. We conclude the paper in section 5.
</bodyText>
<sectionHeader confidence="0.948579" genericHeader="method">
2 Conditional Random Field (CRF)
</sectionHeader>
<bodyText confidence="0.927251428571428">
Conditional Random Field, introduced by (Laf-
ferty et al., 2001), is a robust sequence learning al-
gorithm based on the conditional probability. Let
an observation sequence O =&lt; o1, o2, ..., oT &gt;
be given; then the conditional probability of a state
sequence S =&lt; s1, s2, ..., sT &gt; can be formu-
lated as:
</bodyText>
<equation confidence="0.99111375">
P(S|O) = \frac{1}{Z_0} \exp\left( \sum_{t=1}^{T} \sum_{k=1}^{K} \lambda_k f_k(s_{t-1}, s_t, o, t) \right) \qquad (1)
</equation>
<bodyText confidence="0.99987475">
where λk is the weight of the feature function
fk(st−1, st, o, t), that is to be learned via training.
In general, feature functions take binary values but
at times they may range between −∞ and +∞. The
output of this function relies on certain state se-
quence i.e. st−1, st and observation properties.
The normalization factor Z0, defined in equation 2,
is used to make all conditional probabilities sum
</bodyText>
<footnote confidence="0.697307">
3http://noisy-text.github.io/
</footnote>
<bodyText confidence="0.911211">
up to unity and can be calculated efficiently using
dynamic programming.
</bodyText>
<equation confidence="0.9208965">
Z_0 = \sum_{S} \exp\left( \sum_{t=1}^{T} \sum_{k=1}^{K} \lambda_k f_k(s_{t-1}, s_t, o, t) \right) \qquad (2)
</equation>
<sectionHeader confidence="0.99279" genericHeader="method">
3 Methods
</sectionHeader>
<bodyText confidence="0.9995734">
After discussing theoretical aspect of CRF, we
now describe our methodology that we use to per-
form text normalization. It comprises two steps.
First step consists of training a supervised ma-
chine learning model for the identification of noisy
text. We implement a set of features that were
mostly derived without using any deep domain-
specific resources and/or tools. We perform 3-fold
cross validation on the training data to determine
the best feature combination. In the second step,
potential candidates identified to be noisy were
analysed and subsequently processed using vari-
ous heuristic based rules for normalization. Fig-
ure 1 depicts schematic diagram of the proposed
system.
</bodyText>
<figureCaption confidence="0.9135655">
Figure 1: Proposed methodology. Dotted hori-
zontal line separates two steps.
</figureCaption>
<subsectionHeader confidence="0.999581">
3.1 Feature Set
</subsectionHeader>
<bodyText confidence="0.999988375">
This section describes the feature set that was im-
plemented for identifying the potential candidates
that need to be normalized. All the features de-
fined are domain-independent in nature. No other
external resources and/or tools, with the exception
of vocabulary of words 4, were used in the pro-
posed work. Following are the brief descriptions
of the implemented features.
</bodyText>
<equation confidence="0.965128272727273">
4http://noisy-text.github.io/files/scowl.american.70
Z_0 = \sum_{S} \exp\left( \sum_{t=1}^{T} \sum_{k=1}^{K} \lambda_k f_k(s_{t-1}, s_t, o, t) \right)
</equation>
<page confidence="0.978259">
107
</page>
<listItem confidence="0.976231022727273">
1. Local context: Local contextual information
in the forms of surrounding words are used as
the feature.
2. Vocabulary word: Noisy word can not be a
part of valid vocabulary. Therefore, all out-
of-vocabulary (OOV) are the potential candi-
dates that should be normalized. We define a
feature that fires if the current token is OOV.
3. Part-of-Speech (PoS) information: We use
CMU-Tweet PoS tagger 5 for extracting the
PoS information. This is used as a feature of
CRF.
4. Word length: From the given training data
we observed that noisy texts are generally
shorter in lengths. We define a binary val-
ued feature that is set to high if the length of
the candidate token exceeds a predetermined
threshold. In our case we assume the token
to be a noisy text if its length is less than 4
characters.
5. Suffix and Prefix: Suffixes and prefixes of
length upto 4 characters of the current word
are used as the features.
6. Only digit: This feature checks whether the
current token is consisting of only digits or
not. The word has a low probability of be-
ing noisy if it contains only the digits. Few
exceptions are 2(to), 4(for) etc.
7. AlphaDigit: An alphanumeric token has a
high probability of being a noisy text. A bi-
nary valued feature is thus defined in the pro-
posed work which fires when the token is al-
phanumeric.
8. Consecutive characters: This feature fires
when a token with more than 2 consecutive
identical characters is found. This feature helps
in identifying the stretched/elongated words.
9. Compact word form: Apostrophe mark (’)
is used to indicate the omission of one or
more letters from a word (e.g. i’m, you’re
etc.). A binary feature is defined which iden-
tifies the missing apostrophe mark in a word.
10. Present participle (a.k.a ing-form) of a
verb: From the analysis of training data we
</listItem>
<footnote confidence="0.843901">
5http://www.ark.cs.cmu.edu/TweetNLP/
</footnote>
<bodyText confidence="0.9993855">
observed that people tend to skip ‘i’ or ‘g’
from the present participle, i.e. ing form, of
a verb. For example, they use goin in place
of going. Thus a feature is defined and set to
‘on’ if a token is found with the above pat-
tern.
</bodyText>
<listItem confidence="0.779965111111111">
11. Single character: This feature fires when the
token consists of a single character only with
the exception of two characters i.e. ‘I’ and
‘a’.
12. Hash tag &amp; Username: Hash tags and user-
names in tweets, which starts with # &amp; @ re-
spectively, are not considered as noisy text in
the training data. Therefore this feature is set
to false if a token starts with # or @.
</listItem>
<subsectionHeader confidence="0.998034">
3.2 Heuristic rules for normalization
</subsectionHeader>
<bodyText confidence="0.999218166666667">
Once the noisy text was identified in the first step,
we devise a set of rules for normalization. These
rules are heuristic in nature and based on the facts
&amp; analysis on the training data. Below is the list of
rules implemented according to their application
in the proposed work.
</bodyText>
<listItem confidence="0.6594453">
1. Frequent abbreviation: This is the first rule
that we apply on the noisy text. We make
use of a list of frequent abbreviations used
in twitter and its normal form. The list was
compiled from the Web 6,7 and training data.
If the token identified as a potential candidate
in the first step is present in the list we simply
replace it with the normal text, otherwise, we
move onto the next rule.
2. Present participle of a verb: A rule is de-
fined for a misspelled present participle verb
as discussed in section 3.1. We identify and
cross check its PoS tag (i.e. VERB) in order
to retrieve its valid equivalent form.
3. Missing apostrophe(’): Twitter users nor-
mally drop the apostrophe mark in tweets. We
define a rule to identify and insert an apostro-
phe mark at proper place. This rule was em-
ployed for handling following variants: ’m,
’ll, ’ve, ’re, n’t, ’s etc.
</listItem>
<footnote confidence="0.971835333333333">
6http://www.webopedia.com/quick ref/Twitter Dictionary Guide.asp
7http://marketing.wtwhmedia.com/30-must-know-
twitter-abbreviations-and-acronyms/
</footnote>
<page confidence="0.996929">
108
</page>
<bodyText confidence="0.689811">
4. Elongated form: Noisy word in its elon-
gated form (i.e. yeeeeesss for yes) are iden-
tified and translated into valid word by itera-
tively stripping off consecutive characters.
</bodyText>
<table confidence="0.989214666666667">
Dataset # Tweets # Tokens # Noisy
train 2950 44385 3942
test 1967 29421 2776
</table>
<tableCaption confidence="0.999144">
Table 1: Statistics of the dataset
</tableCaption>
<listItem confidence="0.83680616">
5. Split two merged words: This rule splits a
noisy word, if it is a concatenation of two
valid words. For example ‘thankyou’ is con-
catenation of two separate words i.e. ‘thank’
and ‘you’. We find out word pair at each
split point and applied this rule for the pair
that has both valid word. Token ‘thankyou’
has the following word pairs for 7 split points, i.e.
(1: ‘t’, ‘hankyou’; 2: ‘th’, ‘ankyou’; 3: ‘tha’,
‘nkyou’; 4: ‘than’, ‘kyou’; 5: ‘thank’, ‘you’;
6: ‘thanky’, ‘ou’; and 7: ‘thankyo’, ‘u’;).
Word pair (‘thank’, ‘you’) at split point 5 is
chosen for the normalization. Before apply-
ing this rule a threshold for word length was
heuristically set to 6 characters.
6. British to American standard: American
standard was preferred as an official English
language standard for the shared task. We de-
fine a rule which identifies British standard
word and convert it to corresponding Amer-
ican standard counterpart. Notable differ-
ences between the two standards that we have
incorporated in the work are ‘our’ to ‘or’ (e.g.
labour to labor), ‘ise’ to ‘ize’ (e.g. realise to
realize), ‘re’ to ‘er’ (e.g. centre to center) etc.
</listItem>
<sectionHeader confidence="0.926705" genericHeader="evaluation">
4 Datasets and Experiments
</sectionHeader>
<bodyText confidence="0.997709333333333">
In subsequent subsections we discuss the dataset
used in the system and evaluation results, respec-
tively.
</bodyText>
<subsectionHeader confidence="0.999128">
4.1 Data Set
</subsectionHeader>
<bodyText confidence="0.9999788">
Objective of the shared task was to identify and
normalize the noisy text in tweets. Only training
dataset was provided by the shared task organiz-
ers. The training dataset comprises 2,950 tweets
and a total of 3,942 noisy tokens were present
in the dataset. In absence of the development
dataset, we use 3-fold cross validation for train-
ing the model. The gold standard test dataset con-
tains 1,967 tweets. Table 1 lists the statistics of the
datasets.
</bodyText>
<subsectionHeader confidence="0.984908">
4.2 Experimental Results
</subsectionHeader>
<bodyText confidence="0.99889041025641">
Conditional Random Field (CRF)(Lafferty et al.,
2001) was used as a base learning algorithm in the
proposed work. We use the CRF++ 8 based pack-
age for training and testing. To evaluate the per-
formance of the system, an evaluation script along
with the dataset was provided by the organizers.
We perform 3-fold cross-validation technique to
fine-tune the system, and identify the best fitting
feature combination. The performance of 3-fold
cross validation experiment yields the F-measure
of 92.21% for identification problem (i.e. denot-
ing only the candidates for normalization). For
the test set it shows the F-measure of 86.63%. Af-
ter identifying the candidates of normalization we
apply heuristics to perform normalization. Rules
were applied according to their appearance. We
have tried various combinations of rule sequences
and found that the listed sequence is the one which
gives us better performance. While we perform
3-fold cross validation we obtain the precision,
recall and F-measure values of 88.59%, 74.92%
and 81.19%, respectively. Finally we obtain the
precision, recall and F-measure values of 90.26%,
71.91% and 80.05%, respectively. Results of these
experiments are shown in Table 2.
We closely analyze the errors encountered by
our system. We observed that many errors were
due to the incorrect identification of the candidates
that need to be normalized. The jumbled words,
e.g. ‘liek’, ‘whta’ etc. were not properly recog-
nized. With more accurate identification system
we would have achieved better result. For example
in case of 100% noisy text identification, we ob-
tained an increase of 3.75% in our final F-measure.
For normalization error, our method arguably lags
behind in two fronts: a) ambiguities in normal-
ization and b) many-to-one mapping cases. Many
of these may be reduced by careful design of the
heuristic rules.
</bodyText>
<sectionHeader confidence="0.999258" genericHeader="conclusions">
5 Conclusion
</sectionHeader>
<bodyText confidence="0.9929725">
In this paper we have reported our works that we
carried out as part of our participation in the Twit-
ter text normalization shared task. We have de-
veloped a hybrid system where in the first step
</bodyText>
<footnote confidence="0.974191">
8http://taku910.github.io/crfpp/
</footnote>
<page confidence="0.9918">
109
</page>
<table confidence="0.9986112">
Task Dataset Precision Recall F-measure Accuracy
Identification 3-fold cv 89.51 95.08 92.21 98.70
test 93.08 81.01 86.63 97.64
Normalization 3-fold cv 88.59 74.92 81.19 -
test 90.26 71.91 80.05 -
</table>
<tableCaption confidence="0.999433">
Table 2: Result of the proposed system. All values are in %.
</tableCaption>
<bodyText confidence="0.9999126875">
we identify the candidates for normalization us-
ing a CRF based approach, and in the second step
we employed several heuristics for converting the
wordforms into the normalized form. We have im-
plemented the features which are mostly domain-
independent in the sense that we did not make use
of any domain specific resources and/or tools for
their extraction. Official evaluation shows that our
system achieves the F-measure of 80.05%.
In future we would like to carry out more com-
prehensive analysis on the evaluation results. The
features and rules that we used here are very gen-
eral and straightforward in nature. In future we
would like to modify the system into a fully ma-
chine learning based approach and put extra em-
phasis on errors.
</bodyText>
<sectionHeader confidence="0.999062" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.999623566037736">
Grzegorz Chrupala. 2014. Normalizing tweets with
edit scripts and recurrent neural embeddings. In
Proceedings of the 52nd Annual Meeting of the As-
sociation for Computational Linguistics, ACL 2014,
June 22-27, 2014, Baltimore, MD, USA, Volume 2:
Short Papers, pages 680–686.
Jennifer Foster, Özlem Çetinoğlu, Joachim Wagner,
Joseph Le Roux, Stephen Hogan, Joakim Nivre,
Deirdre Hogan, and Josef van Genabith. 2011.
#hardtoparse: POS tagging and parsing the twitter-
verse. In Analyzing Microtext, Papers from the 2011
AAAI Workshop, San Francisco, California, USA,
August 8, 2011.
Bo Han and Timothy Baldwin. 2011. Lexical normal-
isation of short text messages: Makn sens a #twit-
ter. In Proceedings of the 49th Annual Meeting of
the Association for Computational Linguistics: Hu-
man Language Technologies - Volume 1, HLT ’11,
pages 368–378, Stroudsburg, PA, USA. Association
for Computational Linguistics.
John D. Lafferty, Andrew McCallum, and Fernando
C. N. Pereira. 2001. Conditional Random Fields:
Probabilistic Models for Segmenting and Labeling
Sequence Data. In ICML, pages 282–289.
Xiaohua Liu, Shaodian Zhang, Furu Wei, and Ming
Zhou. 2011. Recognizing named entities in tweets.
In Proceedings of the 49th Annual Meeting of the
Association for Computational Linguistics: Human
Language Technologies - Volume 1, HLT ’11, pages
359–367, Stroudsburg, PA, USA. Association for
Computational Linguistics.
Fei Liu, Fuliang Weng, and Xiao Jiang. 2012. A
broad-coverage normalization system for social me-
dia language. In Proceedings of the 50th Annual
Meeting of the Association for Computational Lin-
guistics: Long Papers - Volume 1, ACL ’12, pages
1035–1044, Stroudsburg, PA, USA. Association for
Computational Linguistics.
Jordi Porta and José-Luis Sancho. 2013. Word nor-
malization in twitter using finite-state transducers.
In Proceedings of the Tweet Normalization Work-
shop co-located with 29th Conference of the Span-
ish Society for Natural Language Processing (SE-
PLN 2013), Madrid, Spain, September 20th, 2013.,
pages 49–53.
Pidong Wang and Hwee Tou Ng. 2013. A beam-search
decoder for normalization of social media text with
application to machine translation. In Human Lan-
guage Technologies: Conference of the North Amer-
ican Chapter of the Association of Computational
Linguistics, Proceedings, June 9-14, 2013, Westin
Peachtree Plaza Hotel, Atlanta, Georgia, USA,
pages 471–481.
</reference>
<page confidence="0.998378">
110
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.480130">
<title confidence="0.999202">IITP: Hybrid Approach for Text Normalization in Twitter</title>
<author confidence="0.979531">Md Shad Akhtar</author>
<author confidence="0.979531">Utpal Kumar Sikdar</author>
<author confidence="0.979531">Asif</author>
<affiliation confidence="0.846770333333333">Dept of Computer Science and IIT Patna,</affiliation>
<email confidence="0.936203">(shad.pcs15,utpal.sikdar,asif)@iitp.ac.in</email>
<abstract confidence="0.998477941176471">In this paper we report our work for normalization of noisy text in Twitter data. The method we propose is hybrid in nature that combines machine learning with rules. In the first step, supervised approach based on conditional random field is developed, and in the second step a set of heuristics rules is applied to the candidate wordforms for the normalization. The classifier is trained with a set of features which are derived without the use of any domain-specific feature and/or resource. The overall system yields the precision, recall and F-measure values of 90.26%, 71.91% and 80.05% respectively for the test dataset.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Grzegorz Chrupala</author>
</authors>
<title>Normalizing tweets with edit scripts and recurrent neural embeddings.</title>
<date>2014</date>
<booktitle>In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics, ACL</booktitle>
<pages>680--686</pages>
<location>Baltimore, MD, USA, Volume</location>
<contexts>
<context position="3045" citStr="Chrupala, 2014" startWordPosition="484" endWordPosition="486">cation of candidates for normalization and b) converting the candidate wordforms to the normalized form. Unlike the general well-formatted corpus, like newswire, it does not always contain noisy text. Its main sources are normally those platforms on which users have complete freedom to express themselves. Therefore, user generated tweets are one of the major sources of noisy texts. In the last couples of years researchers across worldwide are actively working for the normalization of noisy contents of twitter (Han and Baldwin, 2011; Liu et al., 2012; Wang and Ng, 2013; Porta and Sancho, 2013; Chrupala, 2014). In (Han and Baldwin, 2011), a linear Support Vector Machine (SVM) classifier was trained for detecting ill-formed words, and then performed normalization based on morphophonemic similarity. Application of edit operations and recurrent neural embedding can be found in (Chrupala, 2014) for text normalization. Their method learns sequence of edit operations using conditional random field (CRF). In another work, (Liu et al., 2012) investigated the human perspectives of enhanced letter transformation, visual priming and the phonetic similarity for the text normalization. The use of beam search de</context>
</contexts>
<marker>Chrupala, 2014</marker>
<rawString>Grzegorz Chrupala. 2014. Normalizing tweets with edit scripts and recurrent neural embeddings. In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics, ACL 2014, June 22-27, 2014, Baltimore, MD, USA, Volume 2: Short Papers, pages 680–686.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jennifer Foster</author>
<author>Özlem Çetinoğlu</author>
<author>Joachim Wagner</author>
<author>Joseph Le Roux</author>
<author>Stephen Hogan</author>
<author>Joakim Nivre</author>
<author>Deirdre Hogan</author>
<author>Josef van Genabith</author>
</authors>
<title>hardtoparse: POS tagging and parsing the twitterverse.</title>
<date>2011</date>
<booktitle>In Analyzing Microtext, Papers from the 2011 AAAI Workshop,</booktitle>
<location>San Francisco, California, USA,</location>
<marker>Foster, C¸etinoglu, Wagner, Le Roux, Hogan, Nivre, Hogan, van Genabith, 2011</marker>
<rawString>Jennifer Foster, ¨Ozlem C¸etinoglu, Joachim Wagner, Joseph Le Roux, Stephen Hogan, Joakim Nivre, Deirdre Hogan, and Josef van Genabith. 2011. #hardtoparse: POS tagging and parsing the twitterverse. In Analyzing Microtext, Papers from the 2011 AAAI Workshop, San Francisco, California, USA, August 8, 2011.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalisation of short text messages: Makn sens a #twitter.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11,</booktitle>
<pages>368--378</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="2967" citStr="Han and Baldwin, 2011" startWordPosition="468" endWordPosition="471">achieve good accuracy levels. The goal of normalization is twofold, i.e. a) identification of candidates for normalization and b) converting the candidate wordforms to the normalized form. Unlike the general well-formatted corpus, like newswire, it does not always contain noisy text. Its main sources are normally those platforms on which users have complete freedom to express themselves. Therefore, user generated tweets are one of the major sources of noisy texts. In the last couples of years researchers across worldwide are actively working for the normalization of noisy contents of twitter (Han and Baldwin, 2011; Liu et al., 2012; Wang and Ng, 2013; Porta and Sancho, 2013; Chrupala, 2014). In (Han and Baldwin, 2011), a linear Support Vector Machine (SVM) classifier was trained for detecting ill-formed words, and then performed normalization based on morphophonemic similarity. Application of edit operations and recurrent neural embedding can be found in (Chrupala, 2014) for text normalization. Their method learns sequence of edit operations using conditional random field (CRF). In another work, (Liu et al., 2012) investigated the human perspectives of enhanced letter transformation, visual priming and</context>
</contexts>
<marker>Han, Baldwin, 2011</marker>
<rawString>Bo Han and Timothy Baldwin. 2011. Lexical normalisation of short text messages: Makn sens a #twitter. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11, pages 368–378, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>John D Lafferty</author>
<author>Andrew McCallum</author>
<author>Fernando C N Pereira</author>
</authors>
<title>Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data. In</title>
<date>2001</date>
<booktitle>ICML,</booktitle>
<pages>282--289</pages>
<contexts>
<context position="4822" citStr="Lafferty et al., 2001" startWordPosition="770" endWordPosition="773">shared task had two variants: constrained mode and unconstrained mode. We participated only for the constrained mode which did not permit us to use any external resources and/or tools except few that were recommended by the organizers. In this paper we report our work for normalization. We implemented a hybrid system where machine learning along with rules are utilized to perform the task. We have exploited lexical and syntactic properties of a tweet as discussed in section 3.1 to derive a feature set for identification of noisy text in the first step. We train Conditional Random Field (CRF) (Lafferty et al., 2001) as a machine learning algorithm to identify the candidate wordforms that need to be normalized. In second step, we apply some rule based methods (as defined in section 3.2) in order to normalize the wordforms which were identified in first step. The organization of the paper is as follows. A brief theoretical discussion on CRF is presented in section 2. Section 3 discuss about the feature set and methodology used in the proposed work. Experimental result and analysis can be found in section 4. We conclude the paper in section 5. 2 Conditional Random Field (CRF) Conditional Random Field, intro</context>
<context position="13391" citStr="Lafferty et al., 2001" startWordPosition="2230" endWordPosition="2233"> we discuss the dataset used in the system and evaluation results, respectively. 4.1 Data Set Objective of the shared task was to identify and normalize the noisy text in tweets. Only training dataset was provided by the shared task organizers. The training dataset comprise of 2,950 tweets and a total of 3,942 noisy tokens were present in the dataset. In absence of the development dataset, we use 3-fold cross validation for training the model. Gold standard test datasets contains 1,967 tweets. Table 1 list the statistics of the datasets. 4.2 Experimental Results Conditional Random Field (CRF)(Lafferty et al., 2001) was used as a base learning algorithm in the proposed work. We use the CRF++ 8 based package for training and testing. To evaluate the performance of the system, an evaluation script along with the dataset was provided by the organizers. We perform 3-fold cross-validation technique to fine-tune the system, and identify the best fitting feature combination. The performance of 3-fold cross validation experiment yields the F-measure of 92.21% for identification problem (i.e. denoting only the candidates for normalization). For the test set it shows the F-measure of 86.63%. After identifying the </context>
</contexts>
<marker>Lafferty, McCallum, Pereira, 2001</marker>
<rawString>John D. Lafferty, Andrew McCallum, and Fernando C. N. Pereira. 2001. Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data. In ICML, pages 282–289.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Xiaohua Liu</author>
<author>Shaodian Zhang</author>
<author>Furu Wei</author>
<author>Ming Zhou</author>
</authors>
<title>Recognizing named entities in tweets.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11,</booktitle>
<pages>359--367</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="2229" citStr="Liu et al., 2011" startWordPosition="348" endWordPosition="351">ild an accurate system for solving any problem related to natural language processing (NLP). At times, user puts extra emphasis by stretching/elongating 1http://en.wikipedia.org/wiki/Twitter 2http://www.cnet.com/news/report-twitter-hits-half-abillion-tweets-a-day/ a valid word to express their feelings. For example, they often use word like ‘yeeessss’ to show their happiness, which is a stretched form of ‘yes’. Normalization of noisy text is an important and necessary pre-processing task for building different applications related to text processing. It is pretty obvious from various studies (Liu et al., 2011; Foster et al., 2011) that presence of noisy texts makes any natural language processing (NLP) task very tedious to achieve good accuracy levels. The goal of normalization is twofold, i.e. a) identification of candidates for normalization and b) converting the candidate wordforms to the normalized form. Unlike the general well-formatted corpus, like newswire, it does not always contain noisy text. Its main sources are normally those platforms on which users have complete freedom to express themselves. Therefore, user generated tweets are one of the major sources of noisy texts. In the last co</context>
</contexts>
<marker>Liu, Zhang, Wei, Zhou, 2011</marker>
<rawString>Xiaohua Liu, Shaodian Zhang, Furu Wei, and Ming Zhou. 2011. Recognizing named entities in tweets. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1, HLT ’11, pages 359–367, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Fei Liu</author>
<author>Fuliang Weng</author>
<author>Xiao Jiang</author>
</authors>
<title>A broad-coverage normalization system for social media language.</title>
<date>2012</date>
<booktitle>In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Long Papers - Volume 1, ACL ’12,</booktitle>
<pages>1035--1044</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="2985" citStr="Liu et al., 2012" startWordPosition="472" endWordPosition="475">evels. The goal of normalization is twofold, i.e. a) identification of candidates for normalization and b) converting the candidate wordforms to the normalized form. Unlike the general well-formatted corpus, like newswire, it does not always contain noisy text. Its main sources are normally those platforms on which users have complete freedom to express themselves. Therefore, user generated tweets are one of the major sources of noisy texts. In the last couples of years researchers across worldwide are actively working for the normalization of noisy contents of twitter (Han and Baldwin, 2011; Liu et al., 2012; Wang and Ng, 2013; Porta and Sancho, 2013; Chrupala, 2014). In (Han and Baldwin, 2011), a linear Support Vector Machine (SVM) classifier was trained for detecting ill-formed words, and then performed normalization based on morphophonemic similarity. Application of edit operations and recurrent neural embedding can be found in (Chrupala, 2014) for text normalization. Their method learns sequence of edit operations using conditional random field (CRF). In another work, (Liu et al., 2012) investigated the human perspectives of enhanced letter transformation, visual priming and the phonetic simi</context>
</contexts>
<marker>Liu, Weng, Jiang, 2012</marker>
<rawString>Fei Liu, Fuliang Weng, and Xiao Jiang. 2012. A broad-coverage normalization system for social media language. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Long Papers - Volume 1, ACL ’12, pages 1035–1044, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jordi Porta</author>
<author>José-Luis Sancho</author>
</authors>
<title>Word normalization in twitter using finite-state transducers.</title>
<date>2013</date>
<booktitle>In Proceedings of the Tweet Normalization Workshop co-located with 29th Conference of the Spanish Society for Natural Language Processing (SEPLN 2013),</booktitle>
<pages>49--53</pages>
<location>Madrid, Spain,</location>
<contexts>
<context position="3028" citStr="Porta and Sancho, 2013" startWordPosition="480" endWordPosition="483">wofold, i.e. a) identification of candidates for normalization and b) converting the candidate wordforms to the normalized form. Unlike the general well-formatted corpus, like newswire, it does not always contain noisy text. Its main sources are normally those platforms on which users have complete freedom to express themselves. Therefore, user generated tweets are one of the major sources of noisy texts. In the last couples of years researchers across worldwide are actively working for the normalization of noisy contents of twitter (Han and Baldwin, 2011; Liu et al., 2012; Wang and Ng, 2013; Porta and Sancho, 2013; Chrupala, 2014). In (Han and Baldwin, 2011), a linear Support Vector Machine (SVM) classifier was trained for detecting ill-formed words, and then performed normalization based on morphophonemic similarity. Application of edit operations and recurrent neural embedding can be found in (Chrupala, 2014) for text normalization. Their method learns sequence of edit operations using conditional random field (CRF). In another work, (Liu et al., 2012) investigated the human perspectives of enhanced letter transformation, visual priming and the phonetic similarity for the text normalization. The use </context>
</contexts>
<marker>Porta, Sancho, 2013</marker>
<rawString>Jordi Porta and José-Luis Sancho. 2013. Word normalization in twitter using finite-state transducers. In Proceedings of the Tweet Normalization Workshop co-located with 29th Conference of the Spanish Society for Natural Language Processing (SEPLN 2013), Madrid, Spain, September 20th, 2013., pages 49–53.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Pidong Wang</author>
<author>Hwee Tou Ng</author>
</authors>
<title>A beam-search decoder for normalization of social media text with application to machine translation.</title>
<date>2013</date>
<booktitle>In Human Language Technologies: Conference of the North American Chapter of the Association of Computational Linguistics, Proceedings,</booktitle>
<pages>471--481</pages>
<location>Atlanta, Georgia, USA,</location>
<contexts>
<context position="3004" citStr="Wang and Ng, 2013" startWordPosition="476" endWordPosition="479"> normalization is twofold, i.e. a) identification of candidates for normalization and b) converting the candidate wordforms to the normalized form. Unlike the general well-formatted corpus, like newswire, it does not always contain noisy text. Its main sources are normally those platforms on which users have complete freedom to express themselves. Therefore, user generated tweets are one of the major sources of noisy texts. In the last couples of years researchers across worldwide are actively working for the normalization of noisy contents of twitter (Han and Baldwin, 2011; Liu et al., 2012; Wang and Ng, 2013; Porta and Sancho, 2013; Chrupala, 2014). In (Han and Baldwin, 2011), a linear Support Vector Machine (SVM) classifier was trained for detecting ill-formed words, and then performed normalization based on morphophonemic similarity. Application of edit operations and recurrent neural embedding can be found in (Chrupala, 2014) for text normalization. Their method learns sequence of edit operations using conditional random field (CRF). In another work, (Liu et al., 2012) investigated the human perspectives of enhanced letter transformation, visual priming and the phonetic similarity for the text</context>
</contexts>
<marker>Wang, Ng, 2013</marker>
<rawString>Pidong Wang and Hwee Tou Ng. 2013. A beam-search decoder for normalization of social media text with application to machine translation. In Human Language Technologies: Conference of the North American Chapter of the Association of Computational Linguistics, Proceedings, June 9-14, 2013, Westin Peachtree Plaza Hotel, Atlanta, Georgia, USA, pages 471–481.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>