<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.005761">
<title confidence="0.9167115">
LYSGROUP:
Adapting a Spanish microtext normalization system to English.
</title>
<author confidence="0.970429">
Yerai Doval, Jesús Vilares, Carlos Gómez-Rodríguez
</author>
<affiliation confidence="0.7155095">
LYS group, Departamento de Computación, Facultade de Informática,
Universidade da Coruña, Campus de Elviña, 15071 A Coruña, Spain
</affiliation>
<email confidence="0.995151">
{yerai.doval, jvilares, cgomezr}@udc.es – www.grupolys.org
</email>
<sectionHeader confidence="0.997367" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.9999390625">
In this article we describe the microtext
normalization system we have used to par-
ticipate in the Normalization of Noisy Text
Task of the ACL W-NUT 2015 Workshop.
Our normalization system was originally
developed for text mining tasks on Span-
ish tweets. Our main goals during its de-
velopment were flexibility, scalability and
maintainability, in order to test a wide va-
riety of approximations to the problem at
hand with minimum effort. We will pay
special attention to the process of adapting
the components of our system to deal with
English tweets which, as we will show,
was achieved without major modifications
of its base structure.
</bodyText>
<sectionHeader confidence="0.999516" genericHeader="keywords">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999981414634146">
The value of Twitter and other microblogging ser-
vices as information sources in domains like mar-
keting, business intelligence, journalism, etc. is
obvious nowadays. Nevertheless, such amount of
information can only be appropriately exploited
through text mining techniques.
However, there are notable differences between
“standard” language and the so-called texting used
in those microtexts. In this kind of writings, it is
important to reduce the number of characters used
to fit their length restrictions while maintaining
the readability of the message to some extent. To
achieve this, most of the techniques applied rely
on phonetics, thus being language-specific (López
Rúa, 2007). For example: intentionally ignoring
orthographic and grammar rules, as in “be like” for
“am/is/are/was/were like” in the case of English
or “asique” for “así que” in the case of Spanish;
the usage of shortenings, contractions and abbre-
viations such as “c u” for “see you” in English or
“ksa” for “casa” in Spanish; or the employment of
smileys to express emotions, for instance :) to ex-
press happiness. These resulting terms are called
lexical variants (Han et al., 2013).
The problem is that, in general, text mining
tools are very sensitive to those phenomena, as
they are designed for dealing with standard texts.
Therefore, it is necessary to normalize these texts
before their processing, that is, to transform them
into standard language. This way “c u nxt week”,
for example, would be transformed into “see you
next week”. This is the goal of the W-NUT 2015
Normalization Task (Baldwin et al., 2015).
The rest of this paper is organized as follows:
Section 2 describes the core architecture of our
system, and how it was adapted to fit this shared
task, and Section 3 presents the resources used.
Next, Section 4 evaluates the system and discusses
the results obtained. Finally, Section 5 presents
our conclusions and considers some possible fu-
ture improvements for our system.
</bodyText>
<sectionHeader confidence="0.994056" genericHeader="introduction">
2 Architecture
</sectionHeader>
<bodyText confidence="0.992508142857143">
Our tweet normalization system was developed
taking as basic premises its flexibility, scalabil-
ity and maintainability. As a starting point, we
took a previous prototype for Spanish tweet nor-
malization (Vilares et al., 2013) which, although
fully functional, did not turn out to be as flexi-
ble and maintainable as expected. This could have
become a problem for future developments, since
the adaptation effort needed to integrate new tech-
niques would have been too large, so we decided
to refactor the whole system to solve this.
The general scheme of the original system mim-
ics that of Han and Baldwin (2011) and comprises
three stages:
</bodyText>
<listItem confidence="0.971996">
1. Tweet preprocessing.
2. In-vocabulary word identification (IV), based
on the lexicon of the system, obtaining as
</listItem>
<page confidence="0.976902">
99
</page>
<note confidence="0.7913485">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 99–105,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.99048185">
a result an initial set of out-of-vocabulary
words (OOV).
3. OOV set processing in order to distinguish be-
tween correct words which are out of the sys-
tem lexicon and proper lexical variants, ob-
taining for each one of the latter a normal-
ized form. This last step can be in turn de-
composed into two: the first one, which gen-
erates a set of possible normalization candi-
dates based on the application of certain nor-
malization techniques; and the second one,
which selects one of these candidates as the
normalized form (in our case, in a score-
driven process).
As for the particular normalization techniques em-
ployed throughout our system, we decided to try
first a combination of two of the traditional ap-
proximations to this task (Kobus et al., 2008): the
spell checking and the automatic speech recogni-
tion metaphors.
</bodyText>
<subsectionHeader confidence="0.997882">
2.1 The pipeline
</subsectionHeader>
<bodyText confidence="0.999944521126761">
We decided to give our system an object oriented
approach (using JAVA) as opposed to the impera-
tive approach of the original prototype (in PERL).
The new system is structured in processors, for-
merly known as modules in the prototype, whose
goal is to apply a certain process to the input
tweets so that we can obtain the normalization
candidates of their terms at its output.
The core component of our system is the
pipeline, consisting of a classic cascade structure
where we can insert an arbitrary number of pro-
cessors and have their inputs and outputs automat-
ically linked. In this way, the original input of the
system becomes the input of the first processor, the
output of the first processor is the input of the sec-
ond one, the output of this second processor is the
input of the third one, and so on, until reaching the
last processor, whose output becomes the output
of the system.
Regarding its design, we have followed good
engineering practices and made extensive use of
design patterns. Among them, it should be noted
the use of the decorator pattern which, in our con-
text, represents a simple pipeline, allowing us to
dynamically stack an arbitrary number of proces-
sors. Its combination with the composition pattern
lets us group them into stages, which enable the
definition of particular processor sequences while
still sharing the same basic processor interface,
thus preserving the flexibility of the decorator.
Thereby, the resulting structure allows for the dy-
namic construction of different pipeline configu-
rations of varying complexity and different levels
of abstraction, not being restricted to the original
settings.
The application of the template pattern allowed
us to factorize great part of the common processes
of the components, such as the sequential iteration
through all the input tweets, which most of the
processors perform. This resulted in a great ho-
mogenization of the code, thus simplifying main-
tenance and allowing us to focus our efforts on the
specific implementation of the processing methods
in each case.
Moreover, some processors make use of exter-
nal tools capable of being changed even at runtime
— something of special interest in multilingual en-
vironments. It should also be possible to integrate
them into other external components, so that their
logic can be reused by others. All this involves
decoupling the processors from the specific imple-
mentations of the external components employed,
which we have achieved through the use of the in-
version of control pattern.
Furthermore, communication between the com-
ponents of the pipeline is done through structured
text files, allowing us to gain flexibility as we can
integrate and exchange with ease new processing
modules regardless of their particular implemen-
tation (Vilares et al., 2013). In this case we have
used XML along with an implementation of the ab-
stract factory pattern for its construction and pars-
ing. This also facilitates possible future migra-
tions to other data representation languages, such
as JSON.
Finally, we have created a dynamic configura-
tion subsystem based on XML files that allows us to
define and instantiate the particular structure of the
pipeline on which we want to process the tweets.
The advantages of such a subsystem are clear, both
for system maintainability and testing:
</bodyText>
<listItem confidence="0.980742571428571">
1. It improves the multilingual support of the
system by enabling the definition of configu-
rations that use processors and resources de-
signed for a particular language.
2. It allows for experimentation in a simple, ag-
ile and documented (the configuration file it-
self also serves as documentation) manner.
</listItem>
<page confidence="0.96738">
100
</page>
<bodyText confidence="0.998797666666667">
on the Metaphone algorithm (Philips, 1990)
and a new Spanish dictionary extracted from
Wikimedia resources.2
</bodyText>
<listItem confidence="0.85274">
3. It avoids the necessity of modifying the sys-
tem source code.
</listItem>
<subsectionHeader confidence="0.997501">
2.2 Configuration before W-NUT 2015
</subsectionHeader>
<bodyText confidence="0.998846642857143">
The current processor configuration for Spanish
tweet normalization derives from that one used
by the initial prototype for its participation in the
TweetNorm 2013 task (Alegria et al., 2013). The
general procedure works like this: firstly, using
processors to prepare the input (preprocessing);
secondly, employing those whose purpose is to
obtain new normalization forms (candidates gen-
eration); thirdly, using those in charge of select-
ing or filtering the best normalization forms (can-
didate filtering/selection); and lastly, employing
those which prepare the final output of the system
(postprocessing). Such setup includes the follow-
ing processors:
</bodyText>
<listItem confidence="0.999866875">
• FreelingProcessor, which reads the
input data in the TweetNorm 2013 format
and uses Freeling (Padró and Stanilovsky,
2012) to perform the tokenization, lemmati-
zation and POS tagging (although these tags
are not currently in use) of the text of the
tweet.
• MentionProcessor,
</listItem>
<bodyText confidence="0.8221475">
HashtagProcessor, URLProcessor
and SmileyProcessor, which act as
filters for OOVs we do not want to consider
for normalization.
</bodyText>
<listItem confidence="0.9951708125">
• LaughESProcessor, which normalizes
laugh string representations, as in “ja” for
“jajaja”.
• PhoneticProcessor, which uses a pho-
netic table to map characters to their phonetic
equivalent strings, such as “x” to “por”.1
• SMSDictionaryProcessor, which
looks for normalization candidates in an SMS
dictionary, for example “también” (too/also)
for “tb”.
• AspellProcessor, which obtains nor-
malization candidates using the spell checker
aspell (Aspell, 2011), as in “polémica”
(controversy) for “polemik”. It should be
noted that this tool has been customised
with a new phonetic table for Spanish, based
</listItem>
<footnote confidence="0.936193">
1The character “x” resembles the multiplication (times)
sign ×, which in Spanish is read as “por”.
</footnote>
<bodyText confidence="0.996469666666667">
AffixESProcessor, which identifies and
normalizes affix-derived Spanish forms of
base words, also supporting phonetical writ-
ing, as in the case of “chikiyo” for “chiquillo”
(little boy), obtained from “chico” with the
suffix “-illo” (little/small).
</bodyText>
<listItem confidence="0.998022416666667">
• NGramProcessor, which calculates the
scores of those most likely normalization
candidates according to the Viterbi algo-
rithm (Manning and Schütze, 1999, Ch. 9)
taking as reference the Web 1T 5-gram
v1 (Brants and Franz, 2006) Spanish lan-
guage model.
• CandidateProcessor, which selects the
top-scoring candidate for each word.
• ResultProcessor, which dumps the
tweet data obtained by the system to a file us-
ing the required format.
</listItem>
<subsectionHeader confidence="0.982271">
2.3 Adaptation for W-NUT 2015
</subsectionHeader>
<bodyText confidence="0.999913363636364">
In general, the adaptation process revolved around
implementing new processors and integrating new
resources to account for the requirements of this
new task, such as the use of English instead of
Spanish on the new I/O data format, while leaving
the base structure of the system untouched. This
was precisely the main goal during the refactoring
process at the beginning of this project.
The resulting configuration includes the follow-
ing new processors (see Section 3 for a description
of the resources they use):
</bodyText>
<listItem confidence="0.995966714285714">
• WNUTTweetProcessor, which parses the
structured input (now in JSON format instead
of plain text) and obtains the system repre-
sentation of the tweets.
• ArkTweetProcessor, which uses the
ark-tweet-nlp POS tagger to obtain the
morphosyntactic information of the input
tweet tokens.
• WNUTFilterProcessor, which filters
out all those terms that should not be normal-
ized according to the task rules (mentions,
hashtags, URLs, etc.) using regular expres-
sions.
•
</listItem>
<footnote confidence="0.991886">
2http://wikimediafoundation.org
</footnote>
<page confidence="0.991146">
101
</page>
<listItem confidence="0.997459">
• LowerCaseProcessor, which takes
all the candidate forms of a token and
lowercases them; AspellCProcessor,
a constrained version of the original
AspellProcessor described in Sec-
tion 2.2 (see Section 3 for further details).
• WNUTNgramProcessor, which is similar
to the previous NGramProcessor but with
some added modifications to fit the particu-
larities of our new custom language model.
• WNUTResultProcessor, which dumps
all tweet data generated by the system in the
required output format (JSON).
</listItem>
<bodyText confidence="0.999840166666667">
We show in Figure 1 a graphical representation
of the architecture of the system both before (left
side) and after (right side) the adaptation.
Unfortunately, time limitations prevented us
from implementing an English phonetic table for
the PhoneticProcessor, which would have
provided us with mappings such as “two”, “too”
or “to” for “2”. To alleviate this, we did extend the
SMS dictionary to cover some of these cases.
It should be noted that because of those limita-
tions we did not address those cases where multi-
ple contiguous tokens of the input tweet should be
normalized into a single output token (i.e. the so
called “n-1 mappings”). Moreover, since that phe-
nomenon was rare (it appeared in just 11 tweets
out of 2950 of the training dataset) we considered
that leaving this feature behind would have little
impact on the final performance of the system.
</bodyText>
<sectionHeader confidence="0.999393" genericHeader="method">
3 Integrated resources
</sectionHeader>
<bodyText confidence="0.980085333333333">
The base resources we have used for this task, and
on which most of the system processors rely, are
the following:
</bodyText>
<listItem confidence="0.975187615384615">
• aspell (Aspell, 2011), the well-known
spell-checker together with its default En-
glish dictionary.
• ark-tweet-nlp (Owoputi et al., 2013), a
Twitter-focused NLP toolkit from which we
have used its POS tagger.
• BerkeleyLM (Pauls and Klein, 2011), a
Java library and toolset focused on language
modeling.
• Redis,3 a noSQL key-value datastore; and
the SMS normalization dictionaries, canoni-
cal lexicon and training dataset provided by
the organizers of the task.
</listItem>
<bodyText confidence="0.6297935">
As a result of processing the previous resources,
we have obtained the following additional ones:
</bodyText>
<listItem confidence="0.98182875">
• A global SMS normalization dictionary im-
plemented as a Redis datastore, whose en-
tries were extracted from the two normaliza-
tion dictionaries and the training dataset pro-
vided by the organizers.
• A Kneser-Ney language model (Kneser and
Ney, 1995) of the target domain (standard
tweet text) obtained with the BerkeleyLM
tools taking as input tweets of the training
dataset.
• A new English dictionary for aspell built
on the canonical lexicon.
</listItem>
<bodyText confidence="0.999921052631579">
With respect to the differences existing between
the configurations of the system for constrained
and unconstrained runs, there is only one. In
the case of the constrained run, since only off-
the-shelf tools are permitted, the aspell spell-
checker was employed using its default dictionary
but filtering its retrieved candidate corrections tak-
ing as reference the canonical lexicon; i.e. only
those candidates that could be found on this lexi-
con were taken into account. On the other hand,
in the case of the unconstrained run, aspell was
used instead with the dictionary obtained from the
canonical lexicon. The rest of the processors and
their parameters remained the same.
Moreover, although we also considered the use
of the Web 1T 5-gram v1 language model in the
unconstrained run, our preliminary tests showed
that the results obtained were very poor in this
case, as we further comment in Section 4.
</bodyText>
<sectionHeader confidence="0.999175" genericHeader="evaluation">
4 Evaluation
</sectionHeader>
<bodyText confidence="0.999749777777778">
Table 1 shows the results obtained for the train-
ing corpus. It should be noted that these corre-
spond to a slightly overfitted system, since we in-
advertently used a language model built using the
whole training dataset (for candidate selection) in
our 10-fold cross-validation framework. Never-
theless, this also gave us an interesting clue to the
main performance bottleneck of our system, as we
will discuss below.
</bodyText>
<page confidence="0.997621">
102
</page>
<figureCaption confidence="0.9992025">
Figure 1: Original pipeline (left) and pipeline adapted for W-NUT 2015 (right) integrated into the archi-
tecture of the system.
</figureCaption>
<table confidence="0.998492">
precision recall F1
constrained 0.8956 0.8746 0.8850
unconstrained 0.8914 0.8739 0.8825
</table>
<tableCaption confidence="0.996004">
Table 1: Training results.
</tableCaption>
<table confidence="0.999813666666667">
precision recall F1
constrained 0.4646 0.6281 0.5341
unconstrained 0.4592 0.6296 0.5310
</table>
<tableCaption confidence="0.999516">
Table 2: Testing results.
</tableCaption>
<bodyText confidence="0.89692475">
Table 2 shows the results obtained for the test
corpus. At the sight of these figures, which differ
considerably from the previous ones, we decided
to analyse them in more detail. For this purpose,
</bodyText>
<footnote confidence="0.804014">
3http://redis.io/
</footnote>
<bodyText confidence="0.999827882352941">
we obtained a recall metric on the scope of the can-
didates proposed by the system; in other words,
we wanted to see how many times the correct can-
didate corresponding to a token of the dataset was
among the ones considered by the system. The
resulting ratio came to 0.87, which means that
most of the times we had had the chance to select
the correct normalization form for a given non-
standard token but the system failed to make the
selection, and is also a consistent figure with re-
spect to those shown on Table 1. This was not a
big surprise for us, mainly because it is a well-
known problem we have been aware of since we
started working on (Spanish) tweet normalization.
Therefore, we can conclude that the performance
bottleneck of our system is still the candidate se-
lection process, which is heavily influenced by the
</bodyText>
<page confidence="0.998582">
103
</page>
<bodyText confidence="0.999060416666667">
language model in use.
In this respect, tuning experiments were also
made by extending our unconstrained configura-
tion through the addition of the Web 1T 5-gram v1
English language model as a knowledge source.
Only unigrams and bigrams could be used be-
cause of unsolved memory limitations. However,
in contrast with previous experiments performed
for Spanish, the resulting performance was unsat-
isfactory. Because of this, the use of these lan-
guage models for our final submission was dis-
missed. According to our analysis, the cause for
this seems to be the great differences, at both the
lexical and syntactical levels, between the texts
used to build this model, which could be con-
sidered as “regular” texts, and those correspond-
ing to tweets, which agrees with the observations
of Chrupała (2014). As illustrative examples of
this type of expressions we can take “I like them
girls” and “Why you no do that?”, which are lex-
ically correct but not syntactically valid, so lan-
guage models built using regular texts will not rec-
ognize them. In the case of our previous experi-
ments on Spanish, this difference was not so clear.
</bodyText>
<sectionHeader confidence="0.992629" genericHeader="conclusions">
5 Conclusions and Future work
</sectionHeader>
<bodyText confidence="0.93569146875">
We have presented in this work the tweet normal-
ization system used by our group to participate in
the W-NUT 2015 Normalization Task which, in
turn, is an adaptation of another existing Spanish
tweet normalization system.
Within the scope of this task, it became clear
that most of the normalization mistakes made by
our system occurred during the candidate selec-
tion stage, as it was unable to determine the correct
normalization term obtained in previous stages
from the set of candidates available. The reason
for it is that we do not have at this very moment
enough training data to build a representative lan-
guage model of the target domain (normalized text
of English tweets).
Furthermore, there is another type of normal-
ization phenomena which, at this moment, can-
not be correctly handled by our system: n-1 map-
pings. This is due to the initial approach we took
for this system, which only considered 1-1 and 1-
n mappings, but not n-1 mappings, together with
our time limitations.
All that being said, as future lines of work we
are considering the following improvements to our
system:
• Obtaining a representative language model of
the target domain by using a larger normal-
ized tweet corpus. This corpus will be com-
prised of tweets without non-standard words,
so we can still capture the morphosyntactic
structure of these texts (Yang and Eisenstein,
2013).
</bodyText>
<listItem confidence="0.6449675">
• Using POS tags and syntactic information to
improve the candidate selection process.
• Integrating a classifier in the extraction pro-
cess of the final normalization candidates,
taking as features aspects such as the syn-
tactic and morphosyntactic information ob-
tained, their probability according to the lan-
guage model, whether they were selected or
not by the Viterbi algorithm, their string and
phonetic differences with respect to the orig-
inal form, etc.
• Keeping the canonical lexicon updated using
resources like Wikipedia, since the language
model construction process relies heavily
upon a good lexical reference in order to cor-
rectly discard non-standard words.
</listItem>
<bodyText confidence="0.999915">
Moreover, we intend to study the application of
tweet normalization, for both Spanish and English
tweets, in opinion mining tasks (Vilares et al.,
2015).
</bodyText>
<sectionHeader confidence="0.998631" genericHeader="acknowledgments">
Acknowledgments
</sectionHeader>
<bodyText confidence="0.9998758">
This research has been partially funded by the
Spanish Ministry of Economy and Competitive-
ness and FEDER (through project FFI2014-51978-
C2-2-R) and by the Autonomous Government of
Galicia (through grant R2014/034).
</bodyText>
<sectionHeader confidence="0.998846" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.989631461538462">
Iñaki Alegría, Nora Aranberri, Víctor Fresno, Pablo
Gamallo, Lluís Padró, Iñaki San Vicente, Jordi
Turmo, and Arkaitz Zubiaga. 2013. Introducción
a la tarea compartida Tweet-Norm 2013: Normal-
ización léxica de tuits en español. In Tweet-Norm
2013. Tweet Normalization Workshop 2013, volume
1086 of CEUR Workshop Proceedings, pages 1–9.
CEUR-WS.org.
GNU ASPELL (rel. 0.60). 2011. Available at: http:
//aspell.net (visited on May 2015).
Timothy Baldwin, Marie Catherine de Marneffe,
Bo Han, Young-Bum Kim, Alan Ritter, and Wei
Xu. 2015. Shared tasks of the 2015 workshop on
</reference>
<page confidence="0.994068">
104
</page>
<reference confidence="0.99488525">
noisy user-generated text: Twitter lexical normal-
ization and named entity recognition. In Proceed-
ings of the Workshop on Noisy User-generated Text
(WNUT 2015), Beijing, China.
Thorsten Brants and Alex Franz. 2006. Web 1T 5-
gram Version 1 (ref. LDC2006T13). DVD. Dis-
tributed by Linguistic Data Consortium.
Grzegorz Chrupała. 2014. Normalizing tweets with
edit scripts and recurrent neural embeddings. In
Proc. of the 52nd Annual Meeting of the Associa-
tion for Computational Linguistics (Volume 2: Short
Papers), pages 680–686. ACL.
Bo Han and Timothy Baldwin. 2011. Lexical normal-
isation of short text messages: makn sens a #twitter.
In Proc. of the 49th Annual Meeting of the Associ-
ation for Computational Linguistics: Human Lan-
guage Technologies (ACL-HLT 2011) - Volume 1,
pages 368–378. ACL.
Bo Han, Paul Cook, and Timothy Baldwin. 2013.
Lexical normalization for social media text. ACM
Transactions on Intelligent Systems and Technology
(TIST), 4(1):5:1–5:27.
Reinhard Kneser and Hermann Ney. 1995. Im-
proved backing-off for M-gram language modeling.
In Proc. of the IEEE International Conference on
Acoustics, Speech and Signal Processing (ICASSP-
95), volume 1, pages 181–184. IEEE.
Catherine Kobus, François Yvon, and Géraldine
Damnati. 2008. Normalizing SMS: Are Two
Metaphors Better Than One? In Proc. of the
22nd International Conference on Computational
Linguistics (COLING’08) - Volume 1, pages 441–
448. ACL.
Paula López Rúa. 2007. Teaching L2 vocabulary
through SMS language: Some didactic guidelines.
Estudios de lingüística inglesa aplicada, 7:165–188.
Christopher D. Manning and Hinrich Schütze. 1999.
Foundations of Statistical Natural Language Pro-
cessing. The MIT Press, Cambridge (Mas-
sachusetts) and London (England).
Olutobi Owoputi, Brendan O’Connor, Chris Dyer,
Kevin Gimpel, Nathan Schneider, and Noah A.
Smith. 2013. Improved part-of-speech tagging
for online conversational text with word clusters.
In Proc. of the 2013 Conference of the North
American Chapter of the Association for Computa-
tional Linguistics: Human Language Technologies
(NAACL-HLT 2013), pages 380–390. ACL. Toolkit
available at: http://www.ark.cs.cmu.edu/
TweetNLP/ (visited on May 2015).
Lluís Padró and Evgeny Stanilovsky. 2012. Freeling
3.0: Towards Wider Multilinguality. In Proceed-
ings of the Eight International Conference on Lan-
guage Resources and Evaluation (LREC’12). Eu-
ropean Language Resources Association (ELRA).
Toolkit available at: http://nlp.lsi.upc.
edu/freeling/ (visited on May 2015).
Adam Pauls and Dan Klein. 2011. Faster and Smaller
N-gram Language Models. In Proc. of the 49th
Annual Meeting of the Association for Computa-
tional Linguistics: Human Language Technologies
(ACL-HLT 2011) - Volume 1, pages 258–267. ACL.
BerkeleyLM source code available at https://
code.google.com/p/berkeleylm/ (visited
on May 2015).
Lawrence Philips. 1990. Hanging on the metaphone.
Computer Language, 7(12):39–43.
Jesús Vilares, Miguel A. Alonso, and David Vilares.
2013. Prototipado rápido de un sistema de nor-
malización de tuits: Una aproximación léxica. In
Tweet-Norm 2013. Tweet Normalization Workshop
2013, volume 1086 of CEUR Workshop Proceed-
ings, pages 39–43. CEUR-WS.org.
David Vilares, Miguel A. Alonso, and Carlos Gómez-
Rodríguez. 2015. On the usefulness of lexical
and syntactic processing in polarity classification of
Twitter messages. Accepted for publication in Jour-
nal of the Association for Information Science and
Technology (JASIST). DOI 10.1002/asi.23284.
Yi Yang and Jacob Eisenstein. 2013. A Log-Linear
Model for Unsupervised Text Normalization. In
Proc. of the 2013 Conference on Empirical Methods
in Natural Language Processing (EMNLP 2013),
pages 61–72. ACL.
</reference>
<page confidence="0.999015">
105
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.214808">
<title confidence="0.7046095">LYSGROUP: Adapting a Spanish microtext normalization system to English.</title>
<author confidence="0.307134">Yerai Doval</author>
<author confidence="0.307134">Jesús Vilares</author>
<author confidence="0.307134">Carlos</author>
<affiliation confidence="0.323635">LYS group, Departamento de Computación, Facultade de</affiliation>
<address confidence="0.389531">da Campus de 15071 A</address>
<email confidence="0.813794">jvilares,</email>
<abstract confidence="0.998539058823529">In this article we describe the microtext normalization system we have used to participate in the Normalization of Noisy Text Task of the ACL W-NUT 2015 Workshop. Our normalization system was originally developed for text mining tasks on Spanish tweets. Our main goals during its development were flexibility, scalability and maintainability, in order to test a wide variety of approximations to the problem at hand with minimum effort. We will pay special attention to the process of adapting the components of our system to deal with English tweets which, as we will show, was achieved without major modifications of its base structure.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Iñaki Alegría</author>
<author>Nora Aranberri</author>
<author>Víctor Fresno</author>
<author>Pablo Gamallo</author>
<author>Lluís Padró</author>
<author>Iñaki San Vicente</author>
<author>Jordi Turmo</author>
<author>Arkaitz Zubiaga</author>
</authors>
<title>Introducción a la tarea compartida Tweet-Norm 2013: Normalización léxica de tuits en español.</title>
<date>2013</date>
<booktitle>In Tweet-Norm 2013. Tweet Normalization Workshop</booktitle>
<volume>1086</volume>
<pages>1--9</pages>
<marker>Alegría, Aranberri, Fresno, Gamallo, Padró, Vicente, Turmo, Zubiaga, 2013</marker>
<rawString>Iñaki Alegría, Nora Aranberri, Víctor Fresno, Pablo Gamallo, Lluís Padró, Iñaki San Vicente, Jordi Turmo, and Arkaitz Zubiaga. 2013. Introducción a la tarea compartida Tweet-Norm 2013: Normalización léxica de tuits en español. In Tweet-Norm 2013. Tweet Normalization Workshop 2013, volume 1086 of CEUR Workshop Proceedings, pages 1–9. CEUR-WS.org.</rawString>
</citation>
<citation valid="true">
<title>Available at: http: //aspell.net (visited on</title>
<date>2011</date>
<contexts>
<context position="3580" citStr="(2011)" startWordPosition="563" endWordPosition="563"> tweet normalization system was developed taking as basic premises its flexibility, scalability and maintainability. As a starting point, we took a previous prototype for Spanish tweet normalization (Vilares et al., 2013) which, although fully functional, did not turn out to be as flexible and maintainable as expected. This could have become a problem for future developments, since the adaptation effort needed to integrate new techniques would have been too large, so we decided to refactor the whole system to solve this. The general scheme of the original system mimics that of Han and Baldwin (2011) and comprises three stages: 1. Tweet preprocessing. 2. In-vocabulary word identification (IV), based on the lexicon of the system, obtaining as 99 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 99–105, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics a result an initial set of out-of-vocabulary words (OOV). 3. OOV set processing in order to distinguish between correct words which are out of the system lexicon and proper lexical variants, obtaining for each one of the latter a normalized form. This last step can be in turn decomposed int</context>
</contexts>
<marker>2011</marker>
<rawString>GNU ASPELL (rel. 0.60). 2011. Available at: http://aspell.net (visited on May 2015).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Timothy Baldwin</author>
<author>Marie Catherine de Marneffe</author>
<author>Bo Han</author>
<author>Young-Bum Kim</author>
<author>Alan Ritter</author>
<author>Wei Xu</author>
</authors>
<title>Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition.</title>
<date>2015</date>
<booktitle>In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015),</booktitle>
<location>Beijing, China.</location>
<marker>Baldwin, de Marneffe, Han, Kim, Ritter, Xu, 2015</marker>
<rawString>Timothy Baldwin, Marie Catherine de Marneffe, Bo Han, Young-Bum Kim, Alan Ritter, and Wei Xu. 2015. Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015), Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Thorsten Brants</author>
<author>Alex Franz</author>
</authors>
<date>2006</date>
<booktitle>Web 1T 5-gram Version 1 (ref. LDC2006T13). DVD. Distributed by Linguistic Data Consortium.</booktitle>
<contexts>
<context position="10793" citStr="Brants and Franz, 2006" startWordPosition="1703" endWordPosition="1706">d with a new phonetic table for Spanish, based 1The character “x” resembles the multiplication (times) sign ×, which in Spanish is read as “por”. AffixESProcessor, which identifies and normalizes affix-derived Spanish forms of base words, also supporting phonetical writing, as in the case of “chikiyo” for “chiquillo” (little boy), obtained from “chico” with the suffix “-illo” (little/small). • NGramProcessor, which calculates the scores of those most likely normalization candidates according to the Viterbi algorithm (Manning and Sch¨utze, 1999, Ch. 9) taking as reference the Web 1T 5-gram v1 (Brants and Franz, 2006) Spanish language model. • CandidateProcessor, which selects the top-scoring candidate for each word. • ResultProcessor, which dumps the tweet data obtained by the system to a file using the required format. 2.3 Adaptation for W-NUT 2015 In general, the adaptation process revolved around implementing new processors and integrating new resources to account for the requirements of this new task, such as the use of English instead of Spanish on the new I/O data format, while leaving the base structure of the system untouched. This was precisely the main goal during the refactoring process at the </context>
</contexts>
<marker>Brants, Franz, 2006</marker>
<rawString>Thorsten Brants and Alex Franz. 2006. Web 1T 5-gram Version 1 (ref. LDC2006T13). DVD. Distributed by Linguistic Data Consortium.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Grzegorz Chrupała</author>
</authors>
<title>Normalizing tweets with edit scripts and recurrent neural embeddings.</title>
<date>2014</date>
<booktitle>In Proc. of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers),</booktitle>
<pages>680--686</pages>
<publisher>ACL.</publisher>
<contexts>
<context position="18088" citStr="Chrupała (2014)" startWordPosition="2887" endWordPosition="2888">el as a knowledge source. Only unigrams and bigrams could be used because of unsolved memory limitations. However, in contrast with previous experiments performed for Spanish, the resulting performance was unsatisfactory. Because of this, the use of these language models for our final submission was dismissed. According to our analysis, the cause for this seems to be the great differences, at both the lexical and syntactical levels, between the texts used to build this model, which could be considered as “regular” texts, and those corresponding to tweets, which agrees with the observations of Chrupała (2014). As illustrative examples of this type of expressions we can take “I like them girls” and “Why you no do that?”, which are lexically correct but not syntactically valid, so language models built using regular texts will not recognize them. In the case of our previous experiments on Spanish, this difference was not so clear. 5 Conclusions and Future work We have presented in this work the tweet normalization system used by our group to participate in the W-NUT 2015 Normalization Task which, in turn, is an adaptation of another existing Spanish tweet normalization system. Within the scope of th</context>
</contexts>
<marker>Chrupała, 2014</marker>
<rawString>Grzegorz Chrupała. 2014. Normalizing tweets with edit scripts and recurrent neural embeddings. In Proc. of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 680–686. ACL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalisation of short text messages: makn sens a #twitter.</title>
<date>2011</date>
<booktitle>In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL-HLT 2011) -</booktitle>
<volume>1</volume>
<pages>368--378</pages>
<publisher>ACL.</publisher>
<contexts>
<context position="3580" citStr="Han and Baldwin (2011)" startWordPosition="560" endWordPosition="563">Architecture Our tweet normalization system was developed taking as basic premises its flexibility, scalability and maintainability. As a starting point, we took a previous prototype for Spanish tweet normalization (Vilares et al., 2013) which, although fully functional, did not turn out to be as flexible and maintainable as expected. This could have become a problem for future developments, since the adaptation effort needed to integrate new techniques would have been too large, so we decided to refactor the whole system to solve this. The general scheme of the original system mimics that of Han and Baldwin (2011) and comprises three stages: 1. Tweet preprocessing. 2. In-vocabulary word identification (IV), based on the lexicon of the system, obtaining as 99 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 99–105, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics a result an initial set of out-of-vocabulary words (OOV). 3. OOV set processing in order to distinguish between correct words which are out of the system lexicon and proper lexical variants, obtaining for each one of the latter a normalized form. This last step can be in turn decomposed int</context>
</contexts>
<marker>Han, Baldwin, 2011</marker>
<rawString>Bo Han and Timothy Baldwin. 2011. Lexical normalisation of short text messages: makn sens a #twitter. In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL-HLT 2011) - Volume 1, pages 368–378. ACL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Paul Cook</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalization for social media text.</title>
<date>2013</date>
<booktitle>ACM Transactions on Intelligent Systems and Technology (TIST),</booktitle>
<pages>5:1--5:27</pages>
<contexts>
<context position="2142" citStr="Han et al., 2013" startWordPosition="322" endWordPosition="325">f the message to some extent. To achieve this, most of the techniques applied rely on phonetics, thus being language-specific (L´opez R´ua, 2007). For example: intentionally ignoring orthographic and grammar rules, as in “be like” for “am/is/are/was/were like” in the case of English or “asique” for “asique” in the case of Spanish; the usage of shortenings, contractions and abbreviations such as “c u” for “see you” in English or “ksa” for “casa” in Spanish; or the employment of smileys to express emotions, for instance :) to express happiness. These resulting terms are called lexical variants (Han et al., 2013). The problem is that, in general, text mining tools are very sensitive to those phenomena, as they are designed for dealing with standard texts. Therefore, it is necessary to normalize these texts before their processing, that is, to transform them into standard language. This way “c u nxt week”, for example, would be transformed into “see you next week”. This is the goal of the W-NUT 2015 Normalization Task (Baldwin et al., 2015). The rest of this paper is organized as follows: Section 2 describes the core architecture of our system, and how it was adapted to fit this shared task, and Sectio</context>
</contexts>
<marker>Han, Cook, Baldwin, 2013</marker>
<rawString>Bo Han, Paul Cook, and Timothy Baldwin. 2013. Lexical normalization for social media text. ACM Transactions on Intelligent Systems and Technology (TIST), 4(1):5:1–5:27.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Reinhard Kneser</author>
<author>Hermann Ney</author>
</authors>
<title>Improved backing-off for M-gram language modeling.</title>
<date>1995</date>
<booktitle>In Proc. of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP95),</booktitle>
<volume>1</volume>
<pages>181--184</pages>
<publisher>IEEE.</publisher>
<contexts>
<context position="14354" citStr="Kneser and Ney, 1995" startWordPosition="2267" endWordPosition="2270"> used its POS tagger. • BerkeleyLM (Pauls and Klein, 2011), a Java library and toolset focused on language modeling. • Redis,3 a noSQL key-value datastore; and the SMS normalization dictionaries, canonical lexicon and training dataset provided by the organizers of the task. As a result of processing the previous resources, we have obtained the following additional ones: • A global SMS normalization dictionary implemented as a Redis datastore, whose entries were extracted from the two normalization dictionaries and the training dataset provided by the organizers. • A Kneser-Ney language model (Kneser and Ney, 1995) of the target domain (standard tweet text) obtained with the BerkeleyLM tools taking as input tweets of the training dataset. • A new English dictionary for aspell built on the canonical lexicon. With respect to the differences existing between the configurations of the system for constrained and unconstrained runs, there is only one. In the case of the constrained run, since only offthe-shelf tools are permitted, the aspell spellchecker was employed using its default dictionary but filtering its retrieved candidate corrections taking as reference the canonical lexicon; i.e. only those candid</context>
</contexts>
<marker>Kneser, Ney, 1995</marker>
<rawString>Reinhard Kneser and Hermann Ney. 1995. Improved backing-off for M-gram language modeling. In Proc. of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP95), volume 1, pages 181–184. IEEE.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Catherine Kobus</author>
<author>François Yvon</author>
<author>Géraldine Damnati</author>
</authors>
<title>Normalizing SMS: Are Two Metaphors Better Than One?</title>
<date>2008</date>
<booktitle>In Proc. of the 22nd International Conference on Computational Linguistics (COLING’08) -</booktitle>
<volume>1</volume>
<pages>441--448</pages>
<publisher>ACL.</publisher>
<contexts>
<context position="4636" citStr="Kobus et al., 2008" startWordPosition="736" endWordPosition="739">ich are out of the system lexicon and proper lexical variants, obtaining for each one of the latter a normalized form. This last step can be in turn decomposed into two: the first one, which generates a set of possible normalization candidates based on the application of certain normalization techniques; and the second one, which selects one of these candidates as the normalized form (in our case, in a scoredriven process). As for the particular normalization techniques employed throughout our system, we decided to try first a combination of two of the traditional approximations to this task (Kobus et al., 2008): the spell checking and the automatic speech recognition metaphors. 2.1 The pipeline We decided to give our system an object oriented approach (using JAVA) as opposed to the imperative approach of the original prototype (in PERL). The new system is structured in processors, formerly known as modules in the prototype, whose goal is to apply a certain process to the input tweets so that we can obtain the normalization candidates of their terms at its output. The core component of our system is the pipeline, consisting of a classic cascade structure where we can insert an arbitrary number of pro</context>
</contexts>
<marker>Kobus, Yvon, Damnati, 2008</marker>
<rawString>Catherine Kobus, François Yvon, and Géraldine Damnati. 2008. Normalizing SMS: Are Two Metaphors Better Than One? In Proc. of the 22nd International Conference on Computational Linguistics (COLING’08) - Volume 1, pages 441–448. ACL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Paula López Rúa</author>
</authors>
<title>Teaching L2 vocabulary through SMS language: Some didactic guidelines. Estudios de lingüística inglesa aplicada,</title>
<date>2007</date>
<pages>165--188</pages>
<marker>Rúa, 2007</marker>
<rawString>Paula López Rúa. 2007. Teaching L2 vocabulary through SMS language: Some didactic guidelines. Estudios de lingüística inglesa aplicada, 7:165–188.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Christopher D Manning</author>
<author>Hinrich Schütze</author>
</authors>
<title>Foundations of Statistical Natural Language Processing.</title>
<date>1999</date>
<publisher>The MIT Press,</publisher>
<location>Cambridge (Massachusetts) and London (England).</location>
<marker>Manning, Schütze, 1999</marker>
<rawString>Christopher D. Manning and Hinrich Schütze. 1999. Foundations of Statistical Natural Language Processing. The MIT Press, Cambridge (Massachusetts) and London (England).</rawString>
</citation>
<citation valid="false">
<authors>
<author>Olutobi Owoputi</author>
<author>Brendan O’Connor</author>
<author>Chris Dyer</author>
<author>Kevin Gimpel</author>
<author>Nathan Schneider</author>
<author>Noah A Smith</author>
</authors>
<title>Improved part-of-speech tagging for online conversational text with word clusters.</title>
<date>2013</date>
<booktitle>In Proc. of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT</booktitle>
<pages>380--390</pages>
<note>ACL. Toolkit available at: http://www.ark.cs.cmu.edu/ TweetNLP/ (visited on</note>
<marker>Owoputi, O’Connor, Dyer, Gimpel, Schneider, Smith, 2013</marker>
<rawString>Olutobi Owoputi, Brendan O’Connor, Chris Dyer, Kevin Gimpel, Nathan Schneider, and Noah A. Smith. 2013. Improved part-of-speech tagging for online conversational text with word clusters. In Proc. of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2013), pages 380–390. ACL. Toolkit available at: http://www.ark.cs.cmu.edu/ TweetNLP/ (visited on May 2015).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Lluís Padró</author>
<author>Evgeny Stanilovsky</author>
</authors>
<title>Freeling 3.0: Towards Wider Multilinguality.</title>
<date>2012</date>
<booktitle>In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC’12). European Language Resources Association (ELRA). Toolkit</booktitle>
<note>available at: http://nlp.lsi.upc.edu/freeling/ (visited on</note>
<marker>Padró, Stanilovsky, 2012</marker>
<rawString>Lluís Padró and Evgeny Stanilovsky. 2012. Freeling 3.0: Towards Wider Multilinguality. In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC’12). European Language Resources Association (ELRA). Toolkit available at: http://nlp.lsi.upc.edu/freeling/ (visited on May 2015).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Adam Pauls</author>
<author>Dan Klein</author>
</authors>
<title>Faster and Smaller N-gram Language Models.</title>
<date>2011</date>
<booktitle>In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL-HLT 2011) -</booktitle>
<volume>1</volume>
<pages>258--267</pages>
<note>ACL. BerkeleyLM source code available at https://code.google.com/p/berkeleylm/ (visited on</note>
<contexts>
<context position="13791" citStr="Pauls and Klein, 2011" startWordPosition="2178" endWordPosition="2181">“n-1 mappings”). Moreover, since that phenomenon was rare (it appeared in just 11 tweets out of 2950 of the training dataset) we considered that leaving this feature behind would have little impact on the final performance of the system. 3 Integrated resources The base resources we have used for this task, and on which most of the system processors rely, are the following: • aspell (Aspell, 2011), the well-known spell-checker together with its default English dictionary. • ark-tweet-nlp (Owoputi et al., 2013), a Twitter-focused NLP toolkit from which we have used its POS tagger. • BerkeleyLM (Pauls and Klein, 2011), a Java library and toolset focused on language modeling. • Redis,3 a noSQL key-value datastore; and the SMS normalization dictionaries, canonical lexicon and training dataset provided by the organizers of the task. As a result of processing the previous resources, we have obtained the following additional ones: • A global SMS normalization dictionary implemented as a Redis datastore, whose entries were extracted from the two normalization dictionaries and the training dataset provided by the organizers. • A Kneser-Ney language model (Kneser and Ney, 1995) of the target domain (standard tweet</context>
</contexts>
<marker>Pauls, Klein, 2011</marker>
<rawString>Adam Pauls and Dan Klein. 2011. Faster and Smaller N-gram Language Models. In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL-HLT 2011) - Volume 1, pages 258–267. ACL. BerkeleyLM source code available at https://code.google.com/p/berkeleylm/ (visited on May 2015).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Lawrence Philips</author>
</authors>
<title>Hanging on the metaphone.</title>
<date>1990</date>
<journal>Computer Language,</journal>
<volume>7</volume>
<issue>12</issue>
<contexts>
<context position="8386" citStr="Philips, 1990" startWordPosition="1349" endWordPosition="1350">ynamic configuration subsystem based on XML files that allows us to define and instantiate the particular structure of the pipeline on which we want to process the tweets. The advantages of such a subsystem are clear, both for system maintainability and testing: 1. It improves the multilingual support of the system by enabling the definition of configurations that use processors and resources designed for a particular language. 2. It allows for experimentation in a simple, agile and documented (the configuration file itself also serves as documentation) manner. 100 on the Metaphone algorithm (Philips, 1990) and a new Spanish dictionary extracted from Wikimedia resources.2 3. It avoids the necessity of modifying the system source code. 2.2 Configuration before W-NUT 2015 The current processor configuration for Spanish tweet normalization derives from that one used by the initial prototype for its participation in the TweetNorm 2013 task (Alegria et al., 2013). The general procedure works like this: firstly, using processors to prepare the input (preprocessing); secondly, employing those whose purpose is to obtain new normalization forms (candidates generation); thirdly, using those in charge of s</context>
</contexts>
<marker>Philips, 1990</marker>
<rawString>Lawrence Philips. 1990. Hanging on the metaphone. Computer Language, 7(12):39–43.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jesús Vilares</author>
<author>Miguel A Alonso</author>
<author>David Vilares</author>
</authors>
<title>Prototipado rápido de un sistema de normalización de tuits: Una aproximación léxica.</title>
<date>2013</date>
<booktitle>In Tweet-Norm 2013. Tweet Normalization Workshop 2013, volume 1086 of CEUR Workshop Proceedings,</booktitle>
<pages>39--43</pages>
<contexts>
<context position="3195" citStr="Vilares et al., 2013" startWordPosition="493" endWordPosition="496">15). The rest of this paper is organized as follows: Section 2 describes the core architecture of our system, and how it was adapted to fit this shared task, and Section 3 presents the resources used. Next, Section 4 evaluates the system and discusses the results obtained. Finally, Section 5 presents our conclusions and considers some possible future improvements for our system. 2 Architecture Our tweet normalization system was developed taking as basic premises its flexibility, scalability and maintainability. As a starting point, we took a previous prototype for Spanish tweet normalization (Vilares et al., 2013) which, although fully functional, did not turn out to be as flexible and maintainable as expected. This could have become a problem for future developments, since the adaptation effort needed to integrate new techniques would have been too large, so we decided to refactor the whole system to solve this. The general scheme of the original system mimics that of Han and Baldwin (2011) and comprises three stages: 1. Tweet preprocessing. 2. In-vocabulary word identification (IV), based on the lexicon of the system, obtaining as 99 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, </context>
<context position="7514" citStr="Vilares et al., 2013" startWordPosition="1205" endWordPosition="1208">terest in multilingual environments. It should also be possible to integrate them into other external components, so that their logic can be reused by others. All this involves decoupling the processors from the specific implementations of the external components employed, which we have achieved through the use of the inversion of control pattern. Furthermore, communication between the components of the pipeline is done through structured text files, allowing us to gain flexibility as we can integrate and exchange with ease new processing modules regardless of their particular implementation (Vilares et al., 2013). In this case we have used XML along with an implementation of the abstract factory pattern for its construction and parsing. This also facilitates possible future migrations to other data representation languages, such as JSON. Finally, we have created a dynamic configuration subsystem based on XML files that allows us to define and instantiate the particular structure of the pipeline on which we want to process the tweets. The advantages of such a subsystem are clear, both for system maintainability and testing: 1. It improves the multilingual support of the system by enabling the definitio</context>
</contexts>
<marker>Vilares, Alonso, Vilares, 2013</marker>
<rawString>Jesús Vilares, Miguel A. Alonso, and David Vilares. 2013. Prototipado rápido de un sistema de normalización de tuits: Una aproximación léxica. In Tweet-Norm 2013. Tweet Normalization Workshop 2013, volume 1086 of CEUR Workshop Proceedings, pages 39–43. CEUR-WS.org.</rawString>
</citation>
<citation valid="true">
<authors>
<author>David Vilares</author>
<author>Miguel A Alonso</author>
<author>Carlos Gómez-Rodríguez</author>
</authors>
<title>On the usefulness of lexical and syntactic processing in polarity classification of Twitter messages. Accepted for publication</title>
<date>2015</date>
<booktitle>in Journal of the Association for Information Science and Technology (JASIST). DOI 10.1002/asi.23284.</booktitle>
<marker>Vilares, Alonso, Gómez-Rodríguez, 2015</marker>
<rawString>David Vilares, Miguel A. Alonso, and Carlos Gómez-Rodríguez. 2015. On the usefulness of lexical and syntactic processing in polarity classification of Twitter messages. Accepted for publication in Journal of the Association for Information Science and Technology (JASIST). DOI 10.1002/asi.23284.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Yi Yang</author>
<author>Jacob Eisenstein</author>
</authors>
<title>A Log-Linear Model for Unsupervised Text Normalization.</title>
<date>2013</date>
<booktitle>In Proc. of the 2013 Conference on Empirical Methods in Natural Language Processing (EMNLP 2013),</booktitle>
<pages>61--72</pages>
<publisher>ACL.</publisher>
<contexts>
<context position="19811" citStr="Yang and Eisenstein, 2013" startWordPosition="3179" endWordPosition="3182">ena which, at this moment, cannot be correctly handled by our system: n-1 mappings. This is due to the initial approach we took for this system, which only considered 1-1 and 1- n mappings, but not n-1 mappings, together with our time limitations. All that being said, as future lines of work we are considering the following improvements to our system: • Obtaining a representative language model of the target domain by using a larger normalized tweet corpus. This corpus will be comprised of tweets without non-standard words, so we can still capture the morphosyntactic structure of these texts (Yang and Eisenstein, 2013). • Using POS tags and syntactic information to improve the candidate selection process. • Integrating a classifier in the extraction process of the final normalization candidates, taking as features aspects such as the syntactic and morphosyntactic information obtained, their probability according to the language model, whether they were selected or not by the Viterbi algorithm, their string and phonetic differences with respect to the original form, etc. • Keeping the canonical lexicon updated using resources like Wikipedia, since the language model construction process relies heavily upon a</context>
</contexts>
<marker>Yang, Eisenstein, 2013</marker>
<rawString>Yi Yang and Jacob Eisenstein. 2013. A Log-Linear Model for Unsupervised Text Normalization. In Proc. of the 2013 Conference on Empirical Methods in Natural Language Processing (EMNLP 2013), pages 61–72. ACL.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>