<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.003645">
<title confidence="0.960477">
IHS_RD: Lexical Normalization for English Tweets
</title>
<author confidence="0.590428">
Dmitry Supranovich
</author>
<note confidence="0.795351">
IHS Inc. / IHS Global Belarus
131 Starovilenskaya St
220123, Minsk, Belarus
</note>
<email confidence="0.923385">
Dmitry.Supranovich@ihs.com
</email>
<sectionHeader confidence="0.99257" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999943714285714">
This paper describes the Twitter lexical nor-
malization system submitted by IHS R&amp;D
Belarus team for the ACL 2015 workshop on
noisy user-generated text. The proposed sys-
tem consists of two components: a CRF-
based approach to identify possible normali-
zation candidates, and a post-processing step
in an attempt to normalize words that do not
have normalization variants in the lexicon.
Evaluation on the test data set showed that
our unconstrained system achieved the F-
measure of 0.8272 (rank 1 out of 5 submis-
sions for the unconstrained mode, rank 2 out
of all 11 submissions).
</bodyText>
<sectionHeader confidence="0.998797" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.9993176">
Social media texts found in such services as
Twitter or Facebook have a great data-mining
potential, as they offer real-time data that can be
useful to monitor public opinion on brands,
products, events, etc. However, current Natural
Language Processing systems are usually opti-
mized for clean data, which is not the type of
data found in social media texts, as they are often
noisy, containing a lot of slang, typos, and ab-
breviations.
Normalizing such text is challenging. We want
to achieve high recall, making as many correc-
tions as possible, but not at the expense of preci-
sion – words should not be incorrectly normal-
ized.
Previous approaches to this task incorporated
different tools and methods: dictionaries, lan-
guage models, finite state transducers, and ma-
chine translation models. Some of the methods
are unsupervised, though often requiring adjust-
ment of parameters based on annotated data (Han
and Baldwin (2011), Liu et al. (2011), and
Gouws et al. (2011)). Some are supervised, like
that in Chrupała (2014), making use of a Condi-
tional Random Field (Lafferty et al., 2001) to
</bodyText>
<subsectionHeader confidence="0.542137">
Viachaslau Patsepnia
</subsectionHeader>
<bodyText confidence="0.988378777777778">
IHS Inc.
55 Cambridge pkwy, Suite 601
Cambridge, MA 02142, USA
Slava.Patsepnia@ihs.com
learn the sequences of edit operations from la-
belled data.
In this paper, we present an approach based on
the usage of normalization lexicons and a CRF
model for identifying potential candidates.
</bodyText>
<sectionHeader confidence="0.986153" genericHeader="method">
2 Task Description
</sectionHeader>
<subsectionHeader confidence="0.751734">
2.1 Dataset
</subsectionHeader>
<bodyText confidence="0.999712333333333">
The corpus provided by the organizers consists
of 2950 annotated tweets. The annotations follow
these guidelines (Baldwin et al., 2015):
</bodyText>
<listItem confidence="0.980214233333333">
• Non-standard words are normalized to one
or more canonical English words based on
a pre-defined lexicon. For instance, l o v e
should be normalized to love (many-to-one
normalization), tmrw to tomorrow (one-to-
one normalization), and cu to see you
(one-to-many normalization). Additional-
ly, IBM should be left untouched as it is in
the lexicon and it is in its canonical form,
and the informal lol should be expanded to
laughing out loud.
• Non-standard words may be either out-of-
vocabulary (OOV) tokens (e.g., tmrw for
tomorrow) or in-vocabulary (IV) tokens
(e.g., wit for with in “I will come wit
you”).
• Only alphanumeric tokens (e.g., 2, 4eva
and tmrw) and apostrophes used in con-
tractions (e.g., yoou&apos;ve) are considered for
normalization. Tokens including hyphens,
single quotes and other types of contrac-
tions should be ignored.
• Domain specific entities are ignored even
if they are in non-standard forms, e.g.,
#ttyl, @nyc
• It is possible for a tweet to have no non-
standard tokens but still require normaliza-
tion (e.g., the example of wit above), and it
is also possible for the tweet to require no
normalization whatsoever.
</listItem>
<page confidence="0.752154">
78
</page>
<note confidence="0.5161955">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 78–81,
Beijing, China, July 31, 2015. © 2015 Association for Computational Linguistics
</note>
<listItem confidence="0.996911">
• Proper nouns should be left untouched,
even if they are not in the given lexicon
(e.g., Twitter).
• All normalizations should use the Ameri-
can spelling (e.g., tokenize rather than to-
kenise).
</listItem>
<subsectionHeader confidence="0.993021">
2.2 Evaluation
</subsectionHeader>
<bodyText confidence="0.9961725">
Evaluation was to be carried out according to
Precision, Recall, and F1 metrics.
</bodyText>
<sectionHeader confidence="0.996611" genericHeader="method">
3 Experimental Setup
</sectionHeader>
<bodyText confidence="0.99079">
First, a normalization lexicon was generated
from the given training data, enriched with the
data from several sources:
</bodyText>
<listItem confidence="0.867678">
• Word pairs extracted from the datasets
used for lexical normalization (Han, 2011;
Liu, 2011)
• The online social media abbreviation list
of Beal (2015)1. Compared to the previous
workshops with one-to-one normaliza-
tions, the current task also considers one-
to-many normalizations, and obviously not
all abbreviations are present in the training
data, so the use of a list of social media
abbreviations can be vital to the system.
</listItem>
<bodyText confidence="0.999644217391305">
At the current stage of development the sys-
tem is unable to differentiate between several
normalization variants; thus, entries with multi-
ple possible variants were reviewed to make the
most suitable variant first in the list (entries that
are most frequent in datasets are placed first, any
ties were manually reviewed).
Second, a CRF model was trained. The labels
chosen were CAND and NOT_CAND, reflecting
potential normalization candidates and words
that should not be normalized, respectively. The
following features were used:
Token: This feature represents the string of
the current token.
Context Feature: The token to the left and
the token to the right are used as two context fea-
tures. The surrounding words usually convey
useful information about a token which helps in
predicting the correct tag for each token.
Alphanumeric feature: This feature checks
whether the token adheres to the annotation
guidelines and makes sure that non-adhering to-
kens are not marked as potential candidates.
</bodyText>
<footnote confidence="0.938457">
1http://www.webopedia.com/quick_ref/textmessageabbr
eviations.asp
</footnote>
<bodyText confidence="0.998340533333333">
Normalization dictionary feature: This fea-
ture checks whether the token is present in the
generated normalization lexicon.
Canonical lexicon feature: This feature indi-
cates whether or not the token is present in the
canonical lexicon provided by the workshop or-
ganizers.
Word length and number of vowels: Two
separate features as well as their correlation, al-
lowing to tag words with uncommon length-
vowel correlation, like bcz, pls, etc.
Edit distance feature: marks a token that is
within an edit distance of 2 or less from any
word in the canonical lexicon.
Third, the text is normalized:
</bodyText>
<listItem confidence="0.930227">
• All tokens tagged as potential candidates
by the CRF model are normalized to their
lexicon variants.
• All alphanumeric words are normalized to
the American spelling with the VarCon
tool (Atkinson, 2015)2. This includes the
tokens which are already normalized using
the lexicon.
• We have also tried to improve the normal-
ization results by using a did-you-mean
(DYM) module that is currently being de-
veloped at IHS R&amp;D team. The DYM
module corrects user queries/sentences
with misspellings by providing corrected
variant(s) with a confidence measure (in-
cluding no correction variant with the cor-
responding confidence measure). The
</listItem>
<bodyText confidence="0.985329761904762">
DYM module is an SVM model trained on
a set of features for each of the multiple
candidates generated for an input que-
ry/sentence. We used the following fea-
tures: error model score, Levenshtein dis-
tance, language model score, the ratio of
common noun vocabulary words, the ratio
of proper noun vocabulary words, and the
number of changes in non-lowercase
words. An error model score was obtained
from an autocompletion and autocorrec-
tion module (AAM) for which an index
was built from 12.4M documents (scien-
tific papers - 42.1%, Wikipedia articles -
23.5%, patents - 19.4%, social texts - 8%,
and news - 7%). The 2-gram language
model was built from 177K patents (1.36G
words and 2.6M vocabulary). Since we did
not have enough time to tailor both DYM
and AAM modules for social text pro-
cessing, DYM and AAM modules were
</bodyText>
<footnote confidence="0.970017">
2http://wordlist.aspell.net/varcon/
</footnote>
<page confidence="0.999023">
79
</page>
<bodyText confidence="0.999511666666667">
used for this Twitter lexical normalization
system as is, being actually tailored for
technical and scientific texts.
</bodyText>
<subsectionHeader confidence="0.945598">
3.1 Results and error analysis
</subsectionHeader>
<bodyText confidence="0.998506">
Testing was performed on the provided corpus of
1967 tweets.
Table 1 shows the performance of our CRF
candidate model with different features:
</bodyText>
<listItem confidence="0.999919833333334">
• A baseline model with only token, context
and alphanumeric features.
• A baseline model with the normalization
dictionary and the canonical lexicon fea-
tures added.
• A model with all features enabled.
</listItem>
<tableCaption confidence="0.538502666666667">
Table 2 reflects our submitted normalization
result and a result without the DYM module de-
scribed above.
</tableCaption>
<table confidence="0.9999582">
Precision Recall F1
(CRF  |(CRF  |(CRF |
Final) Final) Final)
Tokens + 0.991  |0.57  |0.7237 |
Context + 0.8782 0.6013 0.7139
Alphanumeric
Added diction- 0.907  |0.824  |0.8635 |
ary features 0.8376 0.8133 0.8253
All features 0.915  |0.817  |0.8632 |
0.8469 0.8083 0.8272
</table>
<tableCaption confidence="0.91534475">
Table 1. Result metrics of candidate CRF model
with different features (and its impact on the re-
sult after normalization using a submitted sys-
tem).
</tableCaption>
<table confidence="0.9999455">
Precision Recall F1
Lexicon 0.8469 0.8083 0.8272
Normalization
+ DYM
(submitted)
Lexicon 0.8765 0.7949 0.8337
Normalization
without DYM
</table>
<tableCaption confidence="0.916467">
Table 2. Result metrics of two normalization sys-
tem configurations.
</tableCaption>
<bodyText confidence="0.999558916666667">
The DYM feature does a good job correcting
typos and removing excessive duplicate letters
(beutiful → beautiful, tosee → to see, and
smileeeeee → smile). However, even with a high
confidence threshold, quite a number of words
are normalized excessively, mainly those in non-
English (or partially English) tweets, e.g. jeil →
jail, hoje → hope, and wasan → was an, in addi-
tion to some incorrect normalizations like parkd
→ park (instead of parked) or hundread → hun-
dreds (instead of hundred). These mistakes are
frequent, and an increase in recall does not out-
weigh a loss in precision; thus, the F-measure
without the DYM feature in its current state is
even a little bit higher than our submitted system
with it. Lowering the confidence threshold brings
more correct normalizations, but due to the na-
ture of tweets even more incorrect ones, leading
to an overall drop in F1 score. Nevertheless, we
decided to use and submit the system with DYM,
since we believe the text normalized this way is
more suitable for further use.
Attempts were made to improve the perfor-
mance of the DYM module as well as to select
the correct candidate from a normalization lexi-
con if there is more than one variant present (ur
→ you’re, your, you). For example, language
detection works well on regular search queries
and could potentially forbid the normalization of
words in non-English tweets. However, it proved
to be not helpful for tweets – the messages are
short, some of them are a mixture of English and
some other language (thus, if there is a normali-
zation restriction on such tweets, potential Eng-
lish normalizations are lost), and slang- and ab-
breviation-rich tweets are hard to analyse. A lan-
guage model was used in an attempt to select a
correct normalization from multiple variants, but
this did not prove to be effective, likely because
the model used was not focused on social media
texts.
We see room for potential improvement in
tuning the DYM tool to social media texts, as
well as in filtering non-English words from nor-
malization candidates, experimenting with lan-
guage models tailored to social media texts and
further enriching the lexicon with new normali-
zation data.
</bodyText>
<sectionHeader confidence="0.999331" genericHeader="conclusions">
4 Conclusion
</sectionHeader>
<bodyText confidence="0.9999561">
In this paper, we presented a system designed for
participation in shared task #2 of the ACL 2015
workshop on noisy user-generated text. Our sys-
tem makes use of CRF for identifying potential
candidates, lexicons to normalize them and a
DYM module as a post-processing step to further
correct some of the misspelled words. Our sys-
tem ranked second among all 11 submissions
with 0.8272 F-measure and ranked first among 5
submissions for the unconstrained mode.
</bodyText>
<page confidence="0.996009">
80
</page>
<sectionHeader confidence="0.99034" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.999889735849057">
Kevin Atkinson. VarCon. Vers. 2015.02.15. Web. 01
Apr. 2015. http://wordlist.aspell.net/varcon/
Timothy Baldwin, Marie-Catherine de Marneffe, Bo
Han, Young-Bum Kim, Alan Ritter, and Wei Xu.
2015. Shared tasks of the 2015 workshop on noisy
user-generated text: Twitter lexical normalization
and named entity recognition. In Proceedings of
the Workshop on Noisy User-generated Text
(WNUT 2015), Beijing, China.
Vangie Beal. Text messaging and online chat abbre-
viations. Web. 01 Apr. 2015.
http://www.webopedia.com/quick_ref/textmessage
abbreviations.asp
Grzegorz Chrupała. 2014. Normalizing tweets with
edit scripts and recurrent neural embeddings. In
Proceedings of the 52nd Annual Meeting of the As-
sociation for Computational Linguistics (ACL
2014), pages 680–686, Baltimore, USA.
Stephan Gouws, Dirk Hovy, and Donald Metzler.
2011. Unsupervised mining of lexical variants from
noisy text. In Proceedings of the First workshop on
Unsupervised Learning in NLP, pages 82–90, Ed-
inburgh, UK.
Bo Han and Timothy Baldwin. 2011. Lexical nor-
malization of short text messages: Makn sens a
#twitter. In Proceedings of the 49th Annual Meet-
ing of the Association for Computational Linguis-
tics (ACL 2011), pages 368-378, Portland, USA.
John D. Lafferty, Andrew McCallum, and Fernando
C. N. Pereira. 2001. Conditional Random Fields:
Probabilistic models for segmenting and labeling
sequence data. In Proceedings of the Eighteenth In-
ternational Conference on Machine Learning,
ICML&apos;01, pages 282–289. Morgan Kaufmann Pub-
lishers Inc., San Francisco, CA, USA
Fei Liu, Fuliang Weng, Bingqing Wang, and Yang
Liu. 2011. Insertion, deletion, or substitution?
Normalizing text messages without pre-
categorization nor supervision. In Proceedings of
the 49th Annual Meeting of the Association for
Computational Linguistics (ACL 2011), pages 71-
76, Portland, USA.
Fei Liu, Fuliang Weng, and Xiao Jiang. 2012. A
broad-coverage normalization system for social
media language. In Proceedings of the 50th Annual
Meeting of the Association for Computational Lin-
guistics (ACL 2012), pages 1035-1044, Jeju Island,
Korea.
Yi Yang and Jacob Eisenstein. 2013. A log-linear
model for unsupervised text normalization. In Pro-
ceedings of Conference on Empirical Methods in
Natural Language Processing (EMNLP), pages 61-
72, Seattle, USA.
</reference>
<page confidence="0.999265">
81
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.796165">
<title confidence="0.998774">IHS_RD: Lexical Normalization for English Tweets</title>
<author confidence="0.932302">Dmitry Supranovich</author>
<affiliation confidence="0.999483">IHS Inc. / IHS Global Belarus</affiliation>
<address confidence="0.985872">131 Starovilenskaya St 220123, Minsk, Belarus</address>
<email confidence="0.99556">Dmitry.Supranovich@ihs.com</email>
<abstract confidence="0.990949933333333">This paper describes the Twitter lexical normalization system submitted by IHS R&amp;D Belarus team for the ACL 2015 workshop on noisy user-generated text. The proposed system consists of two components: a CRF-based approach to identify possible normalization candidates, and a post-processing step in an attempt to normalize words that do not have normalization variants in the lexicon. Evaluation on the test data set showed that our unconstrained system achieved the F-measure of 0.8272 (rank 1 out of 5 submissions for the unconstrained mode, rank 2 out of all 11 submissions).</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>VarCon Vers</author>
</authors>
<date>2015</date>
<volume>01</volume>
<note>http://wordlist.aspell.net/varcon/</note>
<marker>Vers, 2015</marker>
<rawString>Kevin Atkinson. VarCon. Vers. 2015.02.15. Web. 01 Apr. 2015. http://wordlist.aspell.net/varcon/</rawString>
</citation>
<citation valid="true">
<authors>
<author>Timothy Baldwin</author>
<author>Marie-Catherine de Marneffe</author>
<author>Bo Han</author>
<author>Young-Bum Kim</author>
<author>Alan Ritter</author>
<author>Wei Xu</author>
</authors>
<title>Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition.</title>
<date>2015</date>
<booktitle>In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015),</booktitle>
<location>Beijing, China.</location>
<marker>Baldwin, de Marneffe, Han, Kim, Ritter, Xu, 2015</marker>
<rawString>Timothy Baldwin, Marie-Catherine de Marneffe, Bo Han, Young-Bum Kim, Alan Ritter, and Wei Xu. 2015. Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015), Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Vangie Beal</author>
</authors>
<title>Text messaging and online chat abbreviations.</title>
<date>2015</date>
<journal>Web.</journal>
<volume>01</volume>
<note>http://www.webopedia.com/quick_ref/textmessage abbreviations.asp</note>
<contexts>
<context position="4218" citStr="Beal (2015)" startWordPosition="676" endWordPosition="677">r Computational Linguistics • Proper nouns should be left untouched, even if they are not in the given lexicon (e.g., Twitter). • All normalizations should use the American spelling (e.g., tokenize rather than tokenise). 2.2 Evaluation Evaluation was to be carried out according to Precision, Recall, and F1 metrics. 3 Experimental Setup First, a normalization lexicon was generated from the given training data, enriched with the data from several sources: • Word pairs extracted from the datasets used for lexical normalization (Han, 2011; Liu, 2011) • The online social media abbreviation list of Beal (2015)1. Compared to the previous workshops with one-to-one normalizations, the current task also considers oneto-many normalizations, and obviously not all abbreviations are present in the training data, so the use of a list of social media abbreviations can be vital to the system. At the current stage of development the system is unable to differentiate between several normalization variants; thus, entries with multiple possible variants were reviewed to make the most suitable variant first in the list (entries that are most frequent in datasets are placed first, any ties were manually reviewed). </context>
</contexts>
<marker>Beal, 2015</marker>
<rawString>Vangie Beal. Text messaging and online chat abbreviations. Web. 01 Apr. 2015. http://www.webopedia.com/quick_ref/textmessage abbreviations.asp</rawString>
</citation>
<citation valid="true">
<authors>
<author>Grzegorz Chrupała</author>
</authors>
<title>Normalizing tweets with edit scripts and recurrent neural embeddings.</title>
<date>2014</date>
<booktitle>In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (ACL 2014),</booktitle>
<pages>680--686</pages>
<location>Baltimore, USA.</location>
<contexts>
<context position="1793" citStr="Chrupała (2014)" startWordPosition="285" endWordPosition="286">ng, typos, and abbreviations. Normalizing such text is challenging. We want to achieve high recall, making as many corrections as possible, but not at the expense of precision – words should not be incorrectly normalized. Previous approaches to this task incorporated different tools and methods: dictionaries, language models, finite state transducers, and machine translation models. Some of the methods are unsupervised, though often requiring adjustment of parameters based on annotated data (Han and Baldwin (2011), Liu et al. (2011), and Gouws et al. (2011)). Some are supervised, like that in Chrupała (2014), making use of a Conditional Random Field (Lafferty et al., 2001) to Viachaslau Patsepnia IHS Inc. 55 Cambridge pkwy, Suite 601 Cambridge, MA 02142, USA Slava.Patsepnia@ihs.com learn the sequences of edit operations from labelled data. In this paper, we present an approach based on the usage of normalization lexicons and a CRF model for identifying potential candidates. 2 Task Description 2.1 Dataset The corpus provided by the organizers consists of 2950 annotated tweets. The annotations follow these guidelines (Baldwin et al., 2015): • Non-standard words are normalized to one or more canonic</context>
</contexts>
<marker>Chrupała, 2014</marker>
<rawString>Grzegorz Chrupała. 2014. Normalizing tweets with edit scripts and recurrent neural embeddings. In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (ACL 2014), pages 680–686, Baltimore, USA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Stephan Gouws</author>
<author>Dirk Hovy</author>
<author>Donald Metzler</author>
</authors>
<title>Unsupervised mining of lexical variants from noisy text.</title>
<date>2011</date>
<booktitle>In Proceedings of the First workshop on Unsupervised Learning in NLP,</booktitle>
<pages>82--90</pages>
<location>Edinburgh, UK.</location>
<contexts>
<context position="1741" citStr="Gouws et al. (2011)" startWordPosition="275" endWordPosition="278"> texts, as they are often noisy, containing a lot of slang, typos, and abbreviations. Normalizing such text is challenging. We want to achieve high recall, making as many corrections as possible, but not at the expense of precision – words should not be incorrectly normalized. Previous approaches to this task incorporated different tools and methods: dictionaries, language models, finite state transducers, and machine translation models. Some of the methods are unsupervised, though often requiring adjustment of parameters based on annotated data (Han and Baldwin (2011), Liu et al. (2011), and Gouws et al. (2011)). Some are supervised, like that in Chrupała (2014), making use of a Conditional Random Field (Lafferty et al., 2001) to Viachaslau Patsepnia IHS Inc. 55 Cambridge pkwy, Suite 601 Cambridge, MA 02142, USA Slava.Patsepnia@ihs.com learn the sequences of edit operations from labelled data. In this paper, we present an approach based on the usage of normalization lexicons and a CRF model for identifying potential candidates. 2 Task Description 2.1 Dataset The corpus provided by the organizers consists of 2950 annotated tweets. The annotations follow these guidelines (Baldwin et al., 2015): • Non-</context>
</contexts>
<marker>Gouws, Hovy, Metzler, 2011</marker>
<rawString>Stephan Gouws, Dirk Hovy, and Donald Metzler. 2011. Unsupervised mining of lexical variants from noisy text. In Proceedings of the First workshop on Unsupervised Learning in NLP, pages 82–90, Edinburgh, UK.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalization of short text messages: Makn sens a #twitter.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011),</booktitle>
<pages>368--378</pages>
<location>Portland, USA.</location>
<contexts>
<context position="1697" citStr="Han and Baldwin (2011)" startWordPosition="266" endWordPosition="269">h is not the type of data found in social media texts, as they are often noisy, containing a lot of slang, typos, and abbreviations. Normalizing such text is challenging. We want to achieve high recall, making as many corrections as possible, but not at the expense of precision – words should not be incorrectly normalized. Previous approaches to this task incorporated different tools and methods: dictionaries, language models, finite state transducers, and machine translation models. Some of the methods are unsupervised, though often requiring adjustment of parameters based on annotated data (Han and Baldwin (2011), Liu et al. (2011), and Gouws et al. (2011)). Some are supervised, like that in Chrupała (2014), making use of a Conditional Random Field (Lafferty et al., 2001) to Viachaslau Patsepnia IHS Inc. 55 Cambridge pkwy, Suite 601 Cambridge, MA 02142, USA Slava.Patsepnia@ihs.com learn the sequences of edit operations from labelled data. In this paper, we present an approach based on the usage of normalization lexicons and a CRF model for identifying potential candidates. 2 Task Description 2.1 Dataset The corpus provided by the organizers consists of 2950 annotated tweets. The annotations follow the</context>
</contexts>
<marker>Han, Baldwin, 2011</marker>
<rawString>Bo Han and Timothy Baldwin. 2011. Lexical normalization of short text messages: Makn sens a #twitter. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011), pages 368-378, Portland, USA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>John D Lafferty</author>
<author>Andrew McCallum</author>
<author>Fernando C N Pereira</author>
</authors>
<title>Conditional Random Fields: Probabilistic models for segmenting and labeling sequence data.</title>
<date>2001</date>
<booktitle>In Proceedings of the Eighteenth International Conference on Machine Learning, ICML&apos;01,</booktitle>
<pages>282--289</pages>
<publisher>Morgan Kaufmann Publishers Inc.,</publisher>
<location>San Francisco, CA, USA</location>
<contexts>
<context position="1859" citStr="Lafferty et al., 2001" startWordPosition="295" endWordPosition="298">lenging. We want to achieve high recall, making as many corrections as possible, but not at the expense of precision – words should not be incorrectly normalized. Previous approaches to this task incorporated different tools and methods: dictionaries, language models, finite state transducers, and machine translation models. Some of the methods are unsupervised, though often requiring adjustment of parameters based on annotated data (Han and Baldwin (2011), Liu et al. (2011), and Gouws et al. (2011)). Some are supervised, like that in Chrupała (2014), making use of a Conditional Random Field (Lafferty et al., 2001) to Viachaslau Patsepnia IHS Inc. 55 Cambridge pkwy, Suite 601 Cambridge, MA 02142, USA Slava.Patsepnia@ihs.com learn the sequences of edit operations from labelled data. In this paper, we present an approach based on the usage of normalization lexicons and a CRF model for identifying potential candidates. 2 Task Description 2.1 Dataset The corpus provided by the organizers consists of 2950 annotated tweets. The annotations follow these guidelines (Baldwin et al., 2015): • Non-standard words are normalized to one or more canonical English words based on a pre-defined lexicon. For instance, l o</context>
</contexts>
<marker>Lafferty, McCallum, Pereira, 2001</marker>
<rawString>John D. Lafferty, Andrew McCallum, and Fernando C. N. Pereira. 2001. Conditional Random Fields: Probabilistic models for segmenting and labeling sequence data. In Proceedings of the Eighteenth International Conference on Machine Learning, ICML&apos;01, pages 282–289. Morgan Kaufmann Publishers Inc., San Francisco, CA, USA</rawString>
</citation>
<citation valid="true">
<authors>
<author>Fei Liu</author>
<author>Fuliang Weng</author>
<author>Bingqing Wang</author>
<author>Yang Liu</author>
</authors>
<title>Insertion, deletion, or substitution? Normalizing text messages without precategorization nor supervision.</title>
<date>2011</date>
<booktitle>In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011),</booktitle>
<pages>71--76</pages>
<location>Portland, USA.</location>
<contexts>
<context position="1716" citStr="Liu et al. (2011)" startWordPosition="270" endWordPosition="273">a found in social media texts, as they are often noisy, containing a lot of slang, typos, and abbreviations. Normalizing such text is challenging. We want to achieve high recall, making as many corrections as possible, but not at the expense of precision – words should not be incorrectly normalized. Previous approaches to this task incorporated different tools and methods: dictionaries, language models, finite state transducers, and machine translation models. Some of the methods are unsupervised, though often requiring adjustment of parameters based on annotated data (Han and Baldwin (2011), Liu et al. (2011), and Gouws et al. (2011)). Some are supervised, like that in Chrupała (2014), making use of a Conditional Random Field (Lafferty et al., 2001) to Viachaslau Patsepnia IHS Inc. 55 Cambridge pkwy, Suite 601 Cambridge, MA 02142, USA Slava.Patsepnia@ihs.com learn the sequences of edit operations from labelled data. In this paper, we present an approach based on the usage of normalization lexicons and a CRF model for identifying potential candidates. 2 Task Description 2.1 Dataset The corpus provided by the organizers consists of 2950 annotated tweets. The annotations follow these guidelines (Bald</context>
</contexts>
<marker>Liu, Weng, Wang, Liu, 2011</marker>
<rawString>Fei Liu, Fuliang Weng, Bingqing Wang, and Yang Liu. 2011. Insertion, deletion, or substitution? Normalizing text messages without precategorization nor supervision. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011), pages 71-76, Portland, USA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Fei Liu</author>
<author>Fuliang Weng</author>
<author>Xiao Jiang</author>
</authors>
<title>A broad-coverage normalization system for social media language.</title>
<date>2012</date>
<booktitle>In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012),</booktitle>
<pages>1035--1044</pages>
<location>Jeju Island,</location>
<marker>Liu, Weng, Jiang, 2012</marker>
<rawString>Fei Liu, Fuliang Weng, and Xiao Jiang. 2012. A broad-coverage normalization system for social media language. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012), pages 1035-1044, Jeju Island, Korea.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Yi Yang</author>
<author>Jacob Eisenstein</author>
</authors>
<title>A log-linear model for unsupervised text normalization.</title>
<date>2013</date>
<booktitle>In Proceedings of Conference on Empirical Methods in Natural Language Processing (EMNLP),</booktitle>
<pages>61--72</pages>
<location>Seattle, USA.</location>
<marker>Yang, Eisenstein, 2013</marker>
<rawString>Yi Yang and Jacob Eisenstein. 2013. A log-linear model for unsupervised text normalization. In Proceedings of Conference on Empirical Methods in Natura Language Processing (EMNLP), pages 61-72, Seattle, USA.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>