<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.004057">
<title confidence="0.995937">
DCU-ADAPT: Learning Edit Operations for Microblog Normalisation
with the Generalised Perceptron
</title>
<author confidence="0.996904">
Joachim Wagner and Jennifer Foster
</author>
<affiliation confidence="0.990771666666667">
ADAPT Centre
School of Computing
Dublin City University
</affiliation>
<address confidence="0.661108">
Dublin, Ireland
</address>
<email confidence="0.998845">
{jwagner|jfoster}@computing.dcu.ie
</email>
<sectionHeader confidence="0.997388" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999982125">
We describe the work carried out by the
DCU-ADAPT team on the Lexical Nor-
malisation shared task at W-NUT 2015.
We train a generalised perceptron to an-
notate noisy text with edit operations that
normalise the text when executed. Fea-
tures are character n-grams, recurrent neu-
ral network language model hidden layer
activations, character class and eligibil-
ity for editing according to the task rules.
We combine predictions from 25 models
trained on subsets of the training data by
selecting the most-likely normalisation ac-
cording to a character language model. We
compare the use of a generalised percep-
tron to the use of conditional random fields
restricted to smaller amounts of training
data due to memory constraints. Fur-
thermore, we make a first attempt to ver-
ify Chrupała (2014)’s hypothesis that the
noisy channel model would not be useful
due to the limited amount of training data
for the source language model, i.e. the lan-
guage model on normalised text.
</bodyText>
<sectionHeader confidence="0.999471" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.99937215">
The W-NUT Lexical Normalisation for English
Tweets shared task is to normalise spelling and
to expand contractions in English microblog mes-
sages (Baldwin et al., 2015). This includes one-to-
many and many-to-one replacements as in “we’re”
and “l o v e”. Tokens containing characters other
than alphanumeric characters and the apostrophe
are excluded from the task, as well as proper nouns
and acronyms that would be acceptable in well-
edited text. (The input, however, does not identify
such tokens and unnecessarily modifying them is
penalised in the evaluation.)
To make evaluation easier, participants are fur-
ther required to align output tokens to input to-
kens, e.g. when the four tokens “l”, “o”, “v” and
“e” are amalgamated to the single token “love”,
three empty tokens must follow in the output. This
is easy for approaches that process the input token
by token but may require extra work if the input
string is processed differently.
We participate in the constrained mode that al-
lows off-the-shelf tools but no normalisation lexi-
cons or additional data to be used. Furthermore,
we do not use any lexicon of canonical English
but learn our normalisation model purely from the
provided training data.
Our approach follows previous work by
Chrupała (2014) in that we train a sequence la-
beller to annotate edit operations that are intended
to normalise the text when applied to the input
text. However, while Chrupała uses conditional
random fields for sequence labelling, we further
experiment with using a generalised perceptron
and with using a simple noisy channel model with
character n-gram language models trained on the
normalised side of the training data to select the fi-
nal normalisation from a set of candidate normali-
sations generated from an ensemble of sequence la-
bellers and from selectively ignoring some of the
proposed edit operations.
</bodyText>
<sectionHeader confidence="0.999125" genericHeader="method">
2 Experimental Setup
</sectionHeader>
<subsectionHeader confidence="0.999977">
2.1 Data Set and Cross-validation
</subsectionHeader>
<bodyText confidence="0.996930181818182">
The microblog data set of the shared task contains
2,950 tweets for training and 1,967 tweets for fi-
nal testing. Each tweet is tokenised and the tokens
of the normalised tweets are aligned to the input,
allowing for one-to-one, many-to-one and one-to-
many alignments.
For five-fold cross-validation, we sort the train-
ing data by tweet ID and split it into 5 sets of
roughly the same number of tokens. (The num-
ber of tweets varies from 579 to 606.) Systems
are trained on four sets and tested on the remain-
</bodyText>
<page confidence="0.990374">
93
</page>
<note confidence="0.7882725">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 93–98,
Beijing, China, July 31, 2015. © 2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.9997582">
ing set. Since the sequence labellers require a
development set, we split the union of the four
sets again into 5 sets to carry out nested cross-
validation, training 25 models in total for each sys-
tem.
</bodyText>
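As a concrete illustration of this nested scheme, here is a minimal Python sketch generating the 25 train/development/test splits; it simplifies by balancing tweet counts rather than token counts as described above, and the function names are our own.

```python
def five_way_split(items):
    """Split a list into 5 contiguous, roughly equal parts."""
    k, rem = divmod(len(items), 5)
    parts, start = [], 0
    for i in range(5):
        end = start + k + (1 if i < rem else 0)
        parts.append(items[start:end])
        start = end
    return parts

def nested_cv_splits(tweets):
    """Yield 5 x 5 = 25 (train, dev, test) splits: roughly 64% / 16% / 20%."""
    outer = five_way_split(tweets)
    for t in range(5):
        rest = [tw for i in range(5) if i != t for tw in outer[i]]
        inner = five_way_split(rest)
        for d in range(5):
            train = [tw for j in range(5) if j != d for tw in inner[j]]
            yield train, inner[d], outer[t]
```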
<subsectionHeader confidence="0.999133">
2.2 Feature Extraction
</subsectionHeader>
<bodyText confidence="0.999909777777778">
For extracting recurrent neural network language
model features, we use Elman1 (Chrupała, 2014),
a modification of the RNNLM toolkit2 (Mikolov et
al., 2010; Mikolov, 2012) that outputs hidden layer
activations. We use the off-the-shelf model from
Chrupała (2014)3. The input is the characters of
the tweet4 in one-hot encoding. The network has a
hidden layer with 400 neurons and it predicts the
next byte. Following Chrupała (2014), we reduce
the 400 activations to 10 binary features: We se-
lect the 10 most active neurons in order and apply
a threshold (0.5) to the activation. The value of the
i-th feature expresses which neuron was i-th most active
and whether its activation was below 0.5, e.g. the
first feature states which neuron is most active and
whether or not its activation is below 0.5. As there
are 400 neurons and 2 possible binarised activa-
tions, there are 800 possible values.5
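A minimal sketch of this binarisation, assuming the 400 hidden-layer activations are available as a list of floats; the feature-string format is our own invention:

```python
def binarise_activations(activations, top_k=10, threshold=0.5):
    """Reduce the 400 RNN-LM activations to 10 categorical features.
    The i-th feature records which neuron is i-th most active and
    whether its activation is below the threshold (800 possible values)."""
    ranked = sorted(range(len(activations)),
                    key=lambda i: activations[i], reverse=True)
    features = []
    for rank, neuron in enumerate(ranked[:top_k]):
        side = 'lo' if activations[neuron] < threshold else 'hi'
        features.append('h%d=%d_%s' % (rank, neuron, side))
    return features
```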
Edit operations are extracted from the parallel
training data searching for the lowest edit distance
and recording the edit operations with dynamic
programming. We customise the edit cost function
to always postpone insertions until after deleting
characters so that each input character can be
assigned exactly one edit operation from the set
{do nothing, delete character, insert string before
character}. To capture insertions at the end of the
tweet, we append a NULL byte to all tweets.
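The customised dynamic program is not reproduced here; as a rough illustration of the resulting per-character edit scripts, the following sketch derives equivalent operations from Python's difflib alignment, attaching every insertion to the next kept character so that each input character (plus the trailing NULL byte) receives exactly one operation:

```python
import difflib

NULL = '\x00'  # appended so insertions at the end of the tweet have a position

def extract_edit_ops(source, target):
    """One op per character of source+NULL: 'keep', 'delete', or
    ('insert', s) meaning insert s before this (kept) character."""
    src, tgt = source + NULL, target + NULL
    ops, pending = [], ''
    matcher = difflib.SequenceMatcher(None, src, tgt, autojunk=False)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            for _ in range(i1, i2):
                ops.append(('insert', pending) if pending else 'keep')
                pending = ''
        elif tag == 'delete':
            ops.extend(['delete'] * (i2 - i1))
        elif tag == 'insert':
            pending += tgt[j1:j2]
        else:  # 'replace': delete the source span, postpone the insertion
            ops.extend(['delete'] * (i2 - i1))
            pending += tgt[j1:j2]
    return ops
```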
The above setup, features and edit operations
are identical to Chrupała (2014) to the best of our
knowledge. We further add a character class fea-
ture {NULL, control, space, apostrophe, punctua-
tion, digit, quote, bracket, lowercase letter, upper-
case letter, non-ASCII, other} and a feature indi-
cating whether the character is part of a token that
is eligible for editing according to the shared task
</bodyText>
<footnote confidence="0.990881727272727">
1https://bitbucket.org/gchrupala/elman
2http://rnnlm.org/
3https://bitbucket.org/gchrupala/
codeswitch/overview
4More precisely, we process UTF-8 bytes. For the train-
ing data, this is the same as characters as the training set does
not contain any multi-byte UTF-8 characters.
5These RNN-LM hidden layer activation features have
been used successfully in text segmentation and word-level
language identification (Chrupała, 2013; Barman et al.,
2014).
</footnote>
<bodyText confidence="0.997521666666667">
rules, i.e. whether or not the characters encountered
since the last space or the start of the tweet are only
letters, digits, apostrophes and spaces.
</bodyText>
<subsectionHeader confidence="0.999921">
2.3 Sequence Labelling
</subsectionHeader>
<bodyText confidence="0.999932227272728">
For character-level sequence labelling, we try (a)
Sequor6 (Chrupała and Klakow, 2010), an imple-
mentation of the generalised perceptron (Collins,
2002),7 with 10 iterations, and (b) Wapiti8
(Lavergne et al., 2010)’s implementation of con-
ditional random fields (Lafferty et al., 2001) using
L-BFGS optimisation with a history of 5 steps, elas-
tic net regularisation (p1 = 0.333 and p2 = 0.001)
and no hard limit on the number of iterations. We
extend the feature templates of Chrupała (2014)9
by including our additional two features. The tem-
plate generates unigram, bigram and trigram char-
acter features within a +/- 2 window. All remain-
ing features are included as unigrams of the cur-
rent value.
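A rough sketch of the character n-gram part of such a template (padding symbols and feature names are our own; the actual Sequor/Wapiti template syntax is not shown):

```python
def char_ngram_features(chars, pos, window=2, max_n=3):
    """Unigram, bigram and trigram character features whose span lies
    entirely inside a +/- 2 window around position pos."""
    padded = ['<s>'] * window + list(chars) + ['</s>'] * window
    centre = pos + window
    feats = []
    for n in range(1, max_n + 1):
        for start in range(centre - window, centre + window + 1):
            if start + n <= centre + window + 1:
                gram = ''.join(padded[start:start + n])
                feats.append('ng%d[%d]=%s' % (n, start - centre, gram))
    return feats
```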
Due to the nested cross-validation (see above),
Sequor is trained on 64% (0.8 x 0.8) of the training
data, 16% (0.8 x 0.2) is used as development set
and 20% (1/5) for testing. For Wapiti, we use only
16% for training (and the remaining 64% for de-
velopment set) in each cross-validation fold due to
memory constraints.10
</bodyText>
<subsectionHeader confidence="0.99428">
2.4 Generating Candidates
</subsectionHeader>
<bodyText confidence="0.999986454545455">
We produce candidate normalisations from the
edit operations proposed by the sequence model.
However, if we allowed each insert and delete op-
eration to be either realised or not, we would pro-
duce up to 2^N candidates, where N is the num-
ber of edit operations. With N = 140 (the maximum
length of a tweet), handling this many candi-
dates is not feasible. Instead, we recursively split
the sequence of edit operations produced by the
sequence labeller into up to eight sections. To find
good split points, we propose to minimise
</bodyText>
<equation confidence="0.92259">
|eL − eR| + max(0, 10 − s)/2        (1)
</equation>
<footnote confidence="0.888513833333333">
6https://bitbucket.org/gchrupala/
sequor
7The generalised perceptron has been shown to match per-
formance of state-of-the-art methods in word segmentation,
POS tagging, dependency parsing and phrase-structure pars-
ing (Zhang and Clark, 2011).
8https://wapiti.limsi.fr/
9We thank Grzegorz Chrupała for providing his template
and for translating it to the Sequor template format.
10With 64%, memory usage grew to over 400 GB over
night, causing heavy swap activity on our machines with 256
GB RAM (and 410 GB swap space).
</footnote>
<page confidence="0.999252">
94
</page>
<bodyText confidence="0.999954958333333">
where eL and eR are the number of insert or delete
operations to the left and right respectively, and s
is the number of consecutive no-operations to the
left. The first term tries to balance the number of
edit operations on each side while the second term
introduces a preference to not split clusters of edit
operations.
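A minimal sketch of this criterion, assuming is_edit is a boolean list marking which positions carry an insert or delete operation (the recursion into up to eight sections is not shown):

```python
def split_cost(is_edit, pos):
    """Equation (1): balance edits on both sides, avoid splitting clusters."""
    e_left = sum(is_edit[:pos])
    e_right = sum(is_edit[pos:])
    s = 0  # consecutive no-operations immediately to the left of pos
    while s < pos and not is_edit[pos - 1 - s]:
        s += 1
    return abs(e_left - e_right) + max(0, 10 - s) / 2.0

def best_split(is_edit):
    """Return the split position with the lowest cost."""
    return min(range(1, len(is_edit)), key=lambda p: split_cost(is_edit, p))
```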
For each section, we either use the edit opera-
tions produced by the sequence labeller or do not
edit the section. As we split each sequence into
no more than eight sections, we produce up to
2^8 = 256 candidates.11 Only one candidate, iden-
tical to the input, will be produced if there are no
delete or insert operations and two candidates will
be produced if there is just one delete or insert op-
eration.
In training, we may potentially produce up to
5 x 256 = 1,280 candidates per tweet as the nested
cross-validation gives us five sequence labellers
per cross-validation run. During testing, up to
25 x 256 = 6,400 candidates may be produced.
(The actual maximum number of candidates may
be lower when labellers agree on the edit opera-
tions.)
</bodyText>
<subsectionHeader confidence="0.994121">
2.5 Applying Edit Operations
</subsectionHeader>
<bodyText confidence="0.98696325">
After producing candidate edit operation se-
quences that use subsets of the edit operations pre-
dicted by a sequence model, the edit operations
are executed to produce candidate strings for the
normalised tweets. As the shared task asks for to-
kenised output aligned to the input tokens, we ap-
ply the edit operations to each token in the follow-
ing sequence:
</bodyText>
<listItem confidence="0.996203818181818">
1. Apply all edit operations at character posi-
tions that correspond to input tokens.
2. Apply insert operations recorded at the space
between tokens and at the end of the tweet to
the preceding token.
3. Apply delete operations at the space between
tokens, moving the contents of the token on
the right to the end of the token on the left,
leaving behind an empty token. (Delete op-
erations at the end-of-tweet marker are ig-
nored.)
</listItem>
<bodyText confidence="0.968190666666667">
Due to time constraints, we do not attempt to
improve the alignment of output tokens to input
tokens.
</bodyText>
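A simplified sketch of steps 1 to 3, assuming ops uses the representation from Section 2.2 (one operation per character of the space-joined tweet plus a final NULL position, each being 'keep', 'delete' or ('insert', s)); the real system works on UTF-8 bytes and handles cases not covered here:

```python
def apply_ops_to_tokens(tokens, ops):
    """Apply per-character edit operations and keep the output aligned
    one-to-one with the input tokens (empty strings mark merged tokens)."""
    out, seps, pos = [], [], 0
    for token in tokens:
        text = ''
        for ch in token:                          # step 1: edits inside the token
            op = ops[pos]; pos += 1
            if op == 'delete':
                continue
            if isinstance(op, tuple):             # ('insert', s): insert s, keep ch
                text += op[1]
            text += ch
        out.append(text)
        seps.append(ops[pos]); pos += 1           # op at the following space / NULL
    target = 0                                    # token that receives merged text
    for i, sep in enumerate(seps):
        if isinstance(sep, tuple):                # step 2: insertion goes left
            out[target] += sep[1]
        if sep == 'delete' and i + 1 < len(out):  # step 3: merge right into left
            out[target] += out[i + 1]
            out[i + 1] = ''
        else:                                     # deletes at the NULL are ignored
            target = i + 1
    return out
```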
<footnote confidence="0.984662">
11Splitting the eight sections again would produce 2^16 =
65,536 candidates.
</footnote>
<subsectionHeader confidence="0.970816">
2.6 Language Modelling
</subsectionHeader>
<bodyText confidence="0.999897125">
For language modelling, we train SRILM (Stol-
cke, 2002) on the normalised tweets of the training
data. As we want to build character n-gram mod-
els and SRILM has no direct support for this, we
re-format the candidate strings to make each char-
acter a token. To distinguish space characters from
token separators, we represent them with double
underscores.
</bodyText>
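A one-line sketch of this reformatting (the double-underscore convention is as described above):

```python
def to_char_tokens(text):
    """Make every character a 'word' for SRILM; spaces become '__'."""
    return ' '.join('__' if ch == ' ' else ch for ch in text)

# e.g. to_char_tokens('so cool') == 's o __ c o o l'
```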
<subsectionHeader confidence="0.989241">
2.7 Candidate Selection
</subsectionHeader>
<bodyText confidence="0.999962">
We use the noisy channel model12 to select the
most plausible source ŝ for the observed target t
from the set of candidates S(t):
</bodyText>
<equation confidence="0.972581">
ŝ = arg max_{s ∈ S(t)} P(t|s) P(s)        (2)
</equation>
<bodyText confidence="0.901108444444445">
P(s) is provided by the language model (Sec-
tion 2.6). Standard models give high probability
to making few or no edits. However, we trust our
sequence models as Chrupała (2014) reported en-
couraging results. Therefore, we give high prob-
ability to using the predicted edit operations. We
consider two models for P(t|s):
</bodyText>
<equation confidence="0.619618666666667">
P1(t|s) = 0.979 if all edit operations are used
          0.020 if s = t
          0.001 otherwise

and

P2(t|s) = 1 if all edit operations are used
          0 otherwise
</equation>
<bodyText confidence="0.979943230769231">
Note that P1 is not a proper probability model as
there is never exactly one “otherwise” case but
2^i − 2 cases, where i is the number of sections con-
sidered in candidate generation, causing the total
to be either 0.999 or between 1.001 and 0.999 +
0.001 x (2^8 − 2) = 1.253. P2 effectively excludes
the original input and all candidates that use only
some but not all of the edit operations suggested
by the sequence labellers. Since there are five se-
quence labellers per cross-validation fold due to
nested cross-validation and 25 sequence labellers
during testing, P2 effectively selects between 5 or
25 candidates.13
</bodyText>
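A minimal sketch of candidate selection under equation (2) with channel model P1; lm_logprob is assumed to return the log10 probability of a candidate under the character n-gram model of Section 2.6, and each candidate is assumed to carry a flag saying whether it uses all predicted edit operations:

```python
import math

def p1_channel(candidate, noisy_input, uses_all_ops):
    """P1(t|s): probability of the observed tweet t given candidate s."""
    if uses_all_ops:
        return 0.979
    if candidate == noisy_input:
        return 0.020
    return 0.001

def select_candidate(candidates, noisy_input, lm_logprob):
    """candidates: list of (string, uses_all_ops) pairs; returns the
    string maximising log P(t|s) + log P(s)."""
    def score(item):
        cand, uses_all = item
        return (math.log10(p1_channel(cand, noisy_input, uses_all))
                + lm_logprob(cand))
    return max(candidates, key=score)[0]
```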
<footnote confidence="0.796797875">
12The noisy channel model has been applied success-
fully to spelling correction (Kernighan et al., 1990; Wilcox-
O’Hearn et al., 2008) and machine translation (Way, 2010),
among other areas.
13Han et al. (2013) also use a trigram language model for
normalisation, but only to reduce a larger candidate set to an
n-best list before applying more complex models to token-
level candidate selection.
</footnote>
<page confidence="0.925573">
95
</page>
<table confidence="0.998675">
n     2      3     4     5     6
WB  14.70   9.97  7.91  7.31  7.19
KN  14.73   9.83  7.81  7.33  7.43
GT  14.63   9.88  7.91  7.45  7.44
</table>
<tableCaption confidence="0.995671">
Table 1: Average language model perplexity over the five cross-validation
runs for n-gram sizes n = 2, ..., 6 and smoothing methods WB = Witten-Bell,
KN = Kneser-Ney and GT = Good-Turing. Standard deviation ≤ 0.23 for all
configurations.
</tableCaption>
<subsectionHeader confidence="0.987479">
2.8 Evaluation Measures
</subsectionHeader>
<bodyText confidence="0.997958">
We evaluate our best systems using the evaluation
script provided by the shared task organisers. It
counts:
</bodyText>
<listItem confidence="0.926268307692308">
• The number of correctly modified tokens, i.e.
tokens that need to be replaced by a new non-
empty token and the system correctly pre-
dicts this token.
• The number of tokens needing normalisation, i.e.
tokens that are modified in the gold output.
However, again, tokens that are to be deleted
are ignored, e.g. “l o v e” to “love” counts
as one event only despite the replacement of
three tokens with empty tokens.
• The number of tokens modified by the system,
i.e. tokens for which a substitution with a
non-empty token is proposed by the system.
</listItem>
<bodyText confidence="0.9998936">
Based on these numbers, precision, recall and F1-
score are calculated and we select the system and
configuration to be used on the test set based
on highest average F1-score over the 5 cross-
validation runs.
</bodyText>
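For reference, a minimal sketch of how the three counts combine into the scores reported below (assuming non-zero counts):

```python
def precision_recall_f1(correctly_modified, needing_normalisation,
                        modified_by_system):
    """Token-level scores as computed by the shared-task evaluation."""
    precision = correctly_modified / modified_by_system
    recall = correctly_modified / needing_normalisation
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1
```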
<sectionHeader confidence="0.999962" genericHeader="method">
3 Results
</sectionHeader>
<bodyText confidence="0.998406166666667">
We use character n-gram language models in the
noisy channel model for candidate selection. To
address sparsity of data that arises when test sen-
tences contain n-grams that are rare or unseen in
the training data, we try Witten-Bell, Kneser-Ney
and Good-Turing smoothing. Table 1 shows av-
erage cross-validation perplexity for these three
smoothing methods and n = 2, ..., 6. Over all
five cross-validation folds, the language model
that gives the lowest perplexity when trained on
the training data and applied to the internal test
set is the 6-gram model with Witten-Bell smoothing.
This confirms the recommendation in the SRILM
documentation to use Witten-Bell smoothing when
the vocabulary is small, such as when building a
character language model.
</bodyText>
<table confidence="0.9978046">
P R F1
P1 W 83.2% 37.7% 51.9%
P1 S 83.2% 41.0% 54.9%
P2 W 85.9% 47.7% 61.4%
P2 S 85.7% 56.1% 67.8%
</table>
<tableCaption confidence="0.984117">
Table 2: Average cross-validation results over the five cross-validation
runs for transition models P1 and P2, W = Wapiti CRF sequence labeller
(trained on only 16% of the training data), S = Sequor generalised
perceptron sequence labeller (trained on 64% of the training data),
P = precision, R = recall, F1 = F1 measure. Standard deviation ≤ 0.03
for all cells.
</tableCaption>
<bodyText confidence="0.996128">
Table 2 shows cross-validation results for the
four systems resulting from the choices between
transition models P1 and P2 and using the Wapiti
CRF or the Sequor generalised perceptron se-
quence labeller. The differences in precision are
not large, but on recall the model P1 performs
poorly. The CRF also consistently has lower re-
call than the respective perceptron model. Inter-
estingly, the CRF achieves the best precision. On F1-
score, the best result is obtained with model P2,
which reduces the noisy channel model to selec-
tion between sequence modeller hypotheses, to-
gether with the Sequor sequence modeller.
On the final test set, our best system using P2
and Sequor has precision 81.90%, recall 55.09%
and F1 65.87%, placing it fifth out of six submis-
sions in the “constrained” category.
</bodyText>
<sectionHeader confidence="0.998938" genericHeader="method">
4 Discussion
</sectionHeader>
<bodyText confidence="0.999974">
A possible explanation for the low recall obtained
with the P1 model is that this noise model cannot
counter the effect that shorter sentences generally
receive higher language model probability scores
and therefore there is a tendency to reject edit op-
erations that insert additional characters.
Furthermore, we observe that our system often
assigns inserted text to the wrong evaluation units,
e.g. inserting the string “ laughing out” before the
space before “lol” and then replacing the second “l”
of “lol” with “ud”. This is not wrong on the string
</bodyText>
<page confidence="0.979318">
96
</page>
<bodyText confidence="0.9998935">
level, but in the token-level evaluation, we make
two errors: wrongly appending “ laughing out” to
the previous token and wrongly normalising “lol”
to just “loud” instead of “laughing out loud”.
Since the model P1 did not come out best, we
cannot reject Chrupała (2014)’s hypothesis that
the noisy channel model would not be useful.
However, our observations also do not provide
much support for this hypothesis as we did not in-
clude standard models from previous work (Cook
and Stevenson, 2009; Han et al., 2013) in our ex-
periment.
</bodyText>
<sectionHeader confidence="0.996388" genericHeader="conclusions">
5 Conclusions
</sectionHeader>
<bodyText confidence="0.9986055">
We trained two sequence modellers to predict edit
operations that normalise input text when exe-
cuted and experimented with applying the noisy
channel model to selecting candidate normalisa-
tion strings.
Future work should:
</bodyText>
<listItem confidence="0.996965966666666">
• Train the CRF on the full training data, either
using a more memory-friendly (but possibly
slower) optimisation method or using an even
larger machine.
• Experiment with LSTM sequence modelling
(Hochreiter and Schmidhuber, 1997; Gers,
2001), which has been applied successfully
to speech recognition and caption genera-
tion (Graves and Jaitly, 2014; Vinyals et al.,
2015).
• Combine models with voting rather than lan-
guage model score.
• For the noisy channel model, try stan-
dard models from previous work (Cook and
Stevenson, 2009; Han et al., 2013).
• To better understand the selection prefer-
ences of the noisy channel model, com-
pare the F1-score obtained when evaluating
against the gold data to the F1-score obtained
when evaluating the system output against its
own input, i.e. are we biased towards doing
nothing?
• Introduce a brevity penalty to counter the ef-
fect of selecting short candidate normalisa-
tions in the noisy channel model.
• Automatically revise the alignment to in-
put tokens according to global co-occurrence
statistics.
• Carry out a full error analysis of what the sys-
tem does well and where it fails.
</listItem>
<sectionHeader confidence="0.98172" genericHeader="acknowledgments">
Acknowledgments
</sectionHeader>
<bodyText confidence="0.999866875">
This research is supported by Science Foun-
dation Ireland through the CNGL Programme
(Grant 12/CE/I2267) in the ADAPT Centre
(www.adaptcentre.ie) at Dublin City University.
We thank the anonymous reviewers for their com-
ments on this paper. Furthermore, we thank Grze-
gorz Chrupała for sharing his feature templates
and for his suggestion to try Sequor.
</bodyText>
<sectionHeader confidence="0.998952" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.9733635">
Timothy Baldwin, Marie Catherine de Marneffe,
Bo Han, Young-Bum Kim, Alan Ritter, and Wei
Xu. 2015. Shared tasks of the 2015 workshop on
noisy user-generated text: Twitter lexical normal-
ization and named entity recognition. In Proceed-
ings of the Workshop on Noisy User-generated Text
(WNUT 2015), Beijing, China.
Utsab Barman, Joachim Wagner, Grzegorz Chrupała,
and Jennifer Foster. 2014. DCU-UVT: Word-
level language classification with code-mixed data.
In Proceedings of the First Workshop on Computa-
tional Approaches to Code Switching. EMNLP 2014,
Conference on Empirical Methods in Natural Lan-
guage Processing, pages 127–132, Doha, Qatar, Oc-
tober. Association for Computational Linguistics.
Grzegorz Chrupała and Dietrich Klakow. 2010.
A named entity labeler for German: Exploiting
Wikipedia and distributional clusters. In Nico-
letta Calzolari (Conference Chair), Khalid Choukri,
Bente Maegaard, Joseph Mariani, Jan Odijk, Ste-
lios Piperidis, Mike Rosner, and Daniel Tapias,
editors, Proceedings of the Seventh conference on
International Language Resources and Evaluation
(LREC’10), Valletta, Malta, May. European Lan-
guage Resources Association (ELRA).
Grzegorz Chrupała. 2013. Text segmentation with
character-level text embeddings. In Proceedings of
the ICML 2013 Workshop on Deep Learning for Au-
dio, Speech and Language Processing, Atlanta, GA,
USA. https://sites.google.com/site/
deeplearningicml2013/accepted_
papers.
Grzegorz Chrupała. 2014. Normalizing tweets with
edit scripts and recurrent neural embeddings. In
Proceedings of the 52nd Annual Meeting of the As-
sociation for Computational Linguistics (Volume 2:
Short Papers), pages 680–686, Baltimore, Mary-
land, June. Association for Computational Linguis-
tics.
Michael Collins. 2002. Discriminative training meth-
ods for hidden Markov models: Theory and experi-
ments with perceptron algorithms. In Proceedings
</reference>
<page confidence="0.997376">
97
</page>
<reference confidence="0.99843596039604">
of the 2002 Conference on Empirical Methods in
Natural Language Processing (EMNLP’02), pages
1–8, Morristown, NJ, USA, July. Association for
Computational Linguistics.
Paul Cook and Suzanne Stevenson. 2009. An un-
supervised model for text message normalization.
In Proceedings of the Workshop on Computational
Approaches to Linguistic Creativity, pages 71–78,
Boulder, Colorado, June. Association for Computa-
tional Linguistics.
Felix Gers. 2001. Long Short-Term Memory in
Recurrent Neural Networks. Ph.D. thesis, École
Polytechnique Fédérale de Lausanne, Département
d’Informatique, Lausanne, Switzerland. http://
www.felixgers.de/papers/phd.pdf.
Alex Graves and Navdeep Jaitly. 2014. Towards
end-to-end speech recognition with recurrent neu-
ral networks. In Eric P. Xing and Tony Jebara, ed-
itors, Proceedings of The 31st International Con-
ference on Machine Learning, volume 32 of JMLR
Workshop and Conference Proceedings. http:
//jmlr.org/proceedings/papers/v32/.
Bo Han, Paul Cook, and Timothy Baldwin. 2013.
Lexical normalization for social media text. ACM
Transactions on Intelligent Systems and Technol-
ogy (TIST) - Special section on twitter and mi-
croblogging services, social recommender sys-
tems, and CAMRa2010: Movie recommenda-
tion in context archive, 4(1):5:1–5:27. doi
10.1145/2414425.2414430.
Sepp Hochreiter and Jürgen Schmidhuber. 1997.
Long short-term memory. Neural Computation,
9(8):1735–1780. doi:10.1162/neco.1997.9.8.1735.
Mark D. Kernighan, Kenneth W. Church, and
William A. Gale. 1990. A spelling correction pro-
gram based on a noisy channel model. In Hans
Karlgren, editor, COLING-90: Papers presented to
the 13th International Conference on Computational
Linguistics on the occasion of the 25th Anniversary
of COLING and the 350th Anniversary of Helsinki
University, Volume 2. http://www.aclweb.
org/anthology/C/C90/.
John Lafferty, Andrew McCallum, and Fernando CN
Pereira. 2001. Conditional random fields: Prob-
abilistic models for segmenting and labeling se-
quence data. In Proceedings of the Eighteenth In-
ternational Conference on Machine Learning (ICML
2001), pages 282–289.
Thomas Lavergne, Olivier Cappé, and François Yvon.
2010. Practical very large scale CRFs. In Proceed-
ings of the 48th Annual Meeting of the Association
for Computational Linguistics, pages 504–513, Up-
psala, Sweden, July. Association for Computational
Linguistics.
T. Mikolov, M. Karafiát, L. Burget, J. Černocký, and
S. Khudanpur. 2010. Recurrent neural network
based language model. In Proceedings of the 11th
Annual Conference of the International Speech
Communication Association (INTERSPEECH
2010), Makuhari, Chiba, Japan. International
Speech Communication Association (ISCA).
http://www.fit.vutbr.cz/research/
groups/speech/publi/2010/mikolov_
interspeech2010_IS100722.pdf.
Tomáš Mikolov. 2012. Statistical Language Mod-
els based on Neural Networks. Ph.D. thesis,
Brno University of Technology, Faculty of In-
formation Technology, Department of Computer
Graphics and Multimedia, Brno, Czech Republic.
http://www.fit.vutbr.cz/~imikolov/
rnnlm/thesis.pdf.
Andreas Stolcke. 2002. SRILM — an extensible lan-
guage modeling toolkit. In John H. L. Hansen and
Bryan Pellom, editors, Proceedings of the 7th Inter-
national Conference on Spoken Language Process-
ing (ICSLP2002), volume 2, pages 901–904, Baixas,
France. International Speech Communication Asso-
ciation (ISCA).
Oriol Vinyals, Alexander Toshev, Samy Bengio, and
Dumitru Erhan. 2015. Show and tell: A neural
image caption generator. http://arxiv.org/
abs/1411.4555, to appear in Computer Vision
and Pattern Recognition.
Andy Way. 2010. Machine translation. In Alexander
Clark, Chris Fox, and Shalom Lappin, editors, The
Handbook of Computational Linguistics and Nat-
ural Language Processing, pages 531–573. Wiley
Blackwell, Chichester, UK, July.
Amber Wilcox-O’Hearn, Graeme Hirst, and Alexander
Budanitsky. 2008. Real-word spelling correction
with trigrams: A reconsideration of the Mays,
Damerau, and Mercer model. In Alexander
Gelbukh, editor, Computational Linguistics and
Intelligent Text Processing - 9th International Con-
ference, CICLing 2008, Haifa, Israel, February
17–23, 2008 - Proceedings, volume 4919/2008,
pages 605–616. Springer Berlin/Heidelberg,
Germany. 2006 draft version available on
http://ftp.cs.toronto.edu/pub/gh/WilcoxOHearn-
etal-2006.pdf.
Yue Zhang and Stephen Clark. 2011. Syntactic pro-
cessing using the generalized perceptron and beam
search. Computational Linguistics, 37(1):105–151.
</reference>
<page confidence="0.996161">
98
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.553479">
<title confidence="0.9980885">DCU-ADAPT: Learning Edit Operations for Microblog Normalisation with the Generalised Perceptron</title>
<author confidence="0.936795">Wagner</author>
<affiliation confidence="0.963308666666667">ADAPT School of Dublin City</affiliation>
<address confidence="0.666345">Dublin,</address>
<abstract confidence="0.99856284">We describe the work carried out by the DCU-ADAPT team on the Lexical Normalisation shared task at W-NUT 2015. We train a generalised perceptron to annotate noisy text with edit operations that normalise the text when executed. Features are character n-grams, recurrent neural network language model hidden layer activations, character class and eligibility for editing according to the task rules. We combine predictions from 25 models trained on subsets of the training data by selecting the most-likely normalisation according to a character language model. We compare the use of a generalised perceptron to the use of conditional random fields restricted to smaller amounts of training data due to memory constraints. Furthermore, we make a first attempt to verify Chrupała (2014)’s hypothesis that the noisy channel model would not be useful due to the limited amount of training data for the source language model, i.e. the language model on normalised text.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Timothy Baldwin</author>
<author>Marie Catherine de Marneffe</author>
<author>Bo Han</author>
<author>Young-Bum Kim</author>
<author>Alan Ritter</author>
<author>Wei Xu</author>
</authors>
<title>Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition.</title>
<date>2015</date>
<booktitle>In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015),</booktitle>
<location>Beijing, China.</location>
<marker>Baldwin, de Marneffe, Han, Kim, Ritter, Xu, 2015</marker>
<rawString>Timothy Baldwin, Marie Catherine de Marneffe, Bo Han, Young-Bum Kim, Alan Ritter, and Wei Xu. 2015. Shared tasks of the 2015 workshop on noisy user-generated text: Twitter lexical normalization and named entity recognition. In Proceedings of the Workshop on Noisy User-generated Text (WNUT 2015), Beijing, China.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Utsab Barman</author>
<author>Joachim Wagner</author>
<author>Grzegorz Chrupała</author>
<author>Jennifer Foster</author>
</authors>
<title>DCU-UVT: Wordlevel language classification with code-mixed data.</title>
<date>2014</date>
<booktitle>In Proceedings of the First Workshop on Computational Approaches to Code Switching. EMNLP 2014, Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>127--132</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Doha, Qatar,</location>
<contexts>
<context position="6262" citStr="Barman et al., 2014" startWordPosition="995" endWordPosition="998">se letter, uppercase letter, non-ASCII, other} and a feature indicating whether the character is part of a token that is eligible for editing according to the shared task 1https://bitbucket.org/gchrupala/elman 2http://rnnlm.org/ 3https://bitbucket.org/gchrupala/ codeswitch/overview 4More precisely, we process UTF-8 bytes. For the training data, this is the same as characters as the training set does not contain any multi-byte UTF-8 characters. 5These RNN-LM hidden layer activation features have been used successfully in text segmentation and word-level language identification (Chrupała, 2013; Barman et al., 2014). rules, i.e. whether or not the characters encountered since the last space or start of tweet only are letters, digits, apostrophes and spaces. 2.3 Sequence Labelling For character-level sequence labelling, we try (a) Sequor6 (Chrupała and Klakow, 2010), an implementation of the generalised perceptron (Collins, 2002),7 with 10 iterations, and (b) Wapiti8 (Lavergne et al., 2010)’s implementation of conditional random fields (Lafferty et al., 2001) using l-bfgs optimisation with a history of 5 steps, elastic net regularisation (p1 = 0.333 and p2 = 0.001) and no hard limit on the number of itera</context>
</contexts>
<marker>Barman, Wagner, Chrupała, Foster, 2014</marker>
<rawString>Utsab Barman, Joachim Wagner, Grzegorz Chrupała, and Jennifer Foster. 2014. DCU-UVT: Wordlevel language classification with code-mixed data. In Proceedings of the First Workshop on Computational Approaches to Code Switching. EMNLP 2014, Conference on Empirical Methods in Natural Language Processing, pages 127–132, Doha, Qatar, October. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="false">
<authors>
<author>Grzegorz Chrupała</author>
<author>Dietrich Klakow</author>
</authors>
<title>A named entity labeler for German: Exploiting Wikipedia and distributional clusters.</title>
<date>2010</date>
<booktitle>Proceedings of the Seventh conference on International Language Resources and Evaluation (LREC’10),</booktitle>
<editor>In Nicoletta Calzolari (Conference Chair), Khalid Choukri, Bente Maegaard, Joseph Mariani, Jan Odijk, Stelios Piperidis, Mike Rosner, and Daniel Tapias, editors,</editor>
<location>Valletta, Malta,</location>
<contexts>
<context position="6516" citStr="Chrupała and Klakow, 2010" startWordPosition="1034" endWordPosition="1037">org/gchrupala/ codeswitch/overview 4More precisely, we process UTF-8 bytes. For the training data, this is the same as characters as the training set does not contain any multi-byte UTF-8 characters. 5These RNN-LM hidden layer activation features have been used successfully in text segmentation and word-level language identification (Chrupała, 2013; Barman et al., 2014). rules, i.e. whether or not the characters encountered since the last space or start of tweet only are letters, digits, apostrophes and spaces. 2.3 Sequence Labelling For character-level sequence labelling, we try (a) Sequor6 (Chrupała and Klakow, 2010), an implementation of the generalised perceptron (Collins, 2002),7 with 10 iterations, and (b) Wapiti8 (Lavergne et al., 2010)’s implementation of conditional random fields (Lafferty et al., 2001) using l-bfgs optimisation with a history of 5 steps, elastic net regularisation (p1 = 0.333 and p2 = 0.001) and no hard limit on the number of iterations. We extend the feature templates of Chrupała (2014)9 by including our additional two features. The template generates unigram, bigram and trigram character features within a +/- 2 window. All remaining features are included as unigrams of the curre</context>
</contexts>
<marker>Chrupała, Klakow, 2010</marker>
<rawString>Grzegorz Chrupała and Dietrich Klakow. 2010. A named entity labeler for German: Exploiting Wikipedia and distributional clusters. In Nicoletta Calzolari (Conference Chair), Khalid Choukri, Bente Maegaard, Joseph Mariani, Jan Odijk, Stelios Piperidis, Mike Rosner, and Daniel Tapias, editors, Proceedings of the Seventh conference on International Language Resources and Evaluation (LREC’10), Valletta, Malta, May. European Language Resources Association (ELRA).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Grzegorz Chrupała</author>
</authors>
<title>Text segmentation with character-level text embeddings.</title>
<date>2013</date>
<booktitle>In Proceedings of the ICML 2013 Workshop on Deep Learning forAudio, Speech and Language Processing,</booktitle>
<location>Atlanta, GA, USA.</location>
<note>https://sites.google.com/site/ deeplearningicml2013/accepted_ papers.</note>
<contexts>
<context position="6240" citStr="Chrupała, 2013" startWordPosition="993" endWordPosition="994">bracket, lowercase letter, uppercase letter, non-ASCII, other} and a feature indicating whether the character is part of a token that is eligible for editing according to the shared task 1https://bitbucket.org/gchrupala/elman 2http://rnnlm.org/ 3https://bitbucket.org/gchrupala/ codeswitch/overview 4More precisely, we process UTF-8 bytes. For the training data, this is the same as characters as the training set does not contain any multi-byte UTF-8 characters. 5These RNN-LM hidden layer activation features have been used successfully in text segmentation and word-level language identification (Chrupała, 2013; Barman et al., 2014). rules, i.e. whether or not the characters encountered since the last space or start of tweet only are letters, digits, apostrophes and spaces. 2.3 Sequence Labelling For character-level sequence labelling, we try (a) Sequor6 (Chrupała and Klakow, 2010), an implementation of the generalised perceptron (Collins, 2002),7 with 10 iterations, and (b) Wapiti8 (Lavergne et al., 2010)’s implementation of conditional random fields (Lafferty et al., 2001) using l-bfgs optimisation with a history of 5 steps, elastic net regularisation (p1 = 0.333 and p2 = 0.001) and no hard limit </context>
</contexts>
<marker>Chrupała, 2013</marker>
<rawString>Grzegorz Chrupała. 2013. Text segmentation with character-level text embeddings. In Proceedings of the ICML 2013 Workshop on Deep Learning forAudio, Speech and Language Processing, Atlanta, GA, USA. https://sites.google.com/site/ deeplearningicml2013/accepted_ papers.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Grzegorz Chrupała</author>
</authors>
<title>Normalizing tweets with edit scripts and recurrent neural embeddings.</title>
<date>2014</date>
<booktitle>In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers),</booktitle>
<pages>680--686</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Baltimore, Maryland,</location>
<contexts>
<context position="1031" citStr="Chrupała (2014)" startWordPosition="155" endWordPosition="156">ith edit operations that normalise the text when executed. Features are character n-grams, recurrent neural network language model hidden layer activations, character class and eligibility for editing according to the task rules. We combine predictions from 25 models trained on subsets of the training data by selecting the most-likely normalisation according to a character language model. We compare the use of a generalised perceptron to the use of conditional random fields restricted to smaller amounts of training data due to memory constraints. Furthermore, we make a first attempt to verify Chrupała (2014)’s hypothesis that the noisy channel model would not be useful due to the limited amount of training data for the source language model, i.e. the language model on normalised text. 1 Introduction The W-NUT Lexical Normalisation for English Tweets shared task is to normalise spelling and to expand contractions in English microblog messages (Baldwin et al., 2015). This includes one-tomany and many-to-one replacements as in “we’re” and “l o v e”. Tokens containing characters other than alphanumeric characters and the apostrophe are excluded from the task, as well as proper nouns and acronyms that</context>
<context position="2487" citStr="Chrupała (2014)" startWordPosition="393" endWordPosition="394"> to input tokens, e.g. when the four tokens “l”, “o”, “v” and “e” are amalgamated to the single token “love”, three empty tokens must follow in the output. This is easy for approaches that process the input token by token but may require extra work if the input string is processed differently. We participate in the constrained mode that allows off-the-shelf tools but no normalisation lexicons and additional data to be used. Furthermore, we do not use any lexicon of canonical English but learn our normalisation model purely from the provided training data. Our approach follows previous work by Chrupała (2014) in that we train a sequence labeller to annotate edit operations that are intended to normalise the text when applied to the input text. However, while Chrupała uses conditional random fields for sequence labelling, we further experiment with using a generalised Perceptron and with using a simple noisy channel model with character n-gram language models trained on the normalised side of the training data to select the final normalisation from a set of candidate normalisation generated from an ensemble of sequence labellers and from selectively ignoring some of the proposed edit operations. 2 </context>
<context position="4126" citStr="Chrupała, 2014" startWordPosition="663" endWordPosition="664">y the same number of tokens. (The number of tweets varies from 579 to 606.) Systems are trained on four sets and tested on the remain93 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 93–98, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics ing set. Since the sequence labellers require a development set, we split the union of the four sets again into 5 sets to carry out nested crossvalidation, training 25 models in total for each system. 2.2 Feature Extraction For extracting recurrent neural network language model features, we use Elman1 (Chrupała, 2014), a modification of the RNNLM toolkit2 (Mikolov et al., 2010; Mikolov, 2012) that outputs hidden layer activations. We use the off-the-shelf model from Chrupała (2014)3. The input are the characters of the tweet4 in one-hot encoding. The network has a hidden layer with 400 neurons and it predicts the next byte. Following Chrupała (2014), we reduce the 400 activations to 10 binary features: We select the 10 most active neurons in order and apply a threshold (0.5) to the activation. The value of the i-th feature expresses which neuron was i-th active and whether its activation was below 0.5, e.g</context>
<context position="5492" citStr="Chrupała (2014)" startWordPosition="889" endWordPosition="890">d activations, there are 800 possible values.5 Edit operations are extracted from the parallel training data searching for the lowest edit distance and recording the edit operations with dynamic programming. We customise the edit costs function to always postpone insertions to after deleting characters so that each input character can be assigned exactly one edit operation from the set {do nothing, delete character, insert string before character}. To capture insertions at the end of the tweet, we append a NULL byte to all tweets. The above setup, features and edit operations are identical to Chrupała (2014) to the best of our knowledge. We further add a character class feature {NULL, control, space, apostrophe, punctuation, digit, quote, bracket, lowercase letter, uppercase letter, non-ASCII, other} and a feature indicating whether the character is part of a token that is eligible for editing according to the shared task 1https://bitbucket.org/gchrupala/elman 2http://rnnlm.org/ 3https://bitbucket.org/gchrupala/ codeswitch/overview 4More precisely, we process UTF-8 bytes. For the training data, this is the same as characters as the training set does not contain any multi-byte UTF-8 characters. 5T</context>
<context position="6919" citStr="Chrupała (2014)" startWordPosition="1102" endWordPosition="1103">s encountered since the last space or start of tweet only are letters, digits, apostrophes and spaces. 2.3 Sequence Labelling For character-level sequence labelling, we try (a) Sequor6 (Chrupała and Klakow, 2010), an implementation of the generalised perceptron (Collins, 2002),7 with 10 iterations, and (b) Wapiti8 (Lavergne et al., 2010)’s implementation of conditional random fields (Lafferty et al., 2001) using l-bfgs optimisation with a history of 5 steps, elastic net regularisation (p1 = 0.333 and p2 = 0.001) and no hard limit on the number of iterations. We extend the feature templates of Chrupała (2014)9 by including our additional two features. The template generates unigram, bigram and trigram character features within a +/- 2 window. All remaining features are included as unigrams of the current value. Due to the nested cross-validation (see above), Sequor is trained on 64% (0.82) of the training data, 16% (0.8 x 0.2) is used as development set and 20% (1/5) for testing. For Wapiti, we use only 16% for training (and the remaining 64% for development set) in each cross-validation fold due to memory constraints.10 2.4 Generating Candidates We produce candidate normalisations from the edit o</context>
<context position="11344" citStr="Chrupała (2014)" startWordPosition="1855" endWordPosition="1856">ta. As we want to build character n-gram models and SRILM has no direct support for this, we re-format the candidate strings to make each character a token. To distinguish space characters from token separators, we represent them with double underscores. 2.7 Candidate Selection We use the noisy channel model12 to select the most plausible source sˆ for the observed target t from the set of candidates S(t): arg max P(t s)P(s) (2) s∈S(t) P(s) is provided by the language model (Section 2.6). Standard models give high probability to making few or no edits. However, we trust our sequence models as Chrupała (2014) reported encouraging results. Therefore, we give high probability to using the predicted edit operations. We consider two models for P(t s): { 0.979 if all edit operations are used 0.020 if s = t 0.001 otherwise P2 (t s) = f 1 if all edit operations are used 0 otherwise Note that P1 is not a proper probability model as there is never exactly one “otherwise” case but 2z −2 cases where i is the number of sections considered in candidate generation, causing the total to be either 0.999 or between 1.001 and 0.999 + 0.001 x (28 − 2) = 1.253. P2 effectively excludes the original input and all candi</context>
<context position="16711" citStr="Chrupała (2014)" startWordPosition="2773" endWordPosition="2774">herefore there is a tendency to reject edit operations that insert additional characters. Furthermore, we observe that our system often assigns inserted text to the wrong evaluation units, e.g. inserting the string “ laughing out” before the space before “lol” and then replacing second “L” of “lol” with “ud”. This is not wrong on the string 96 level, but in the token-level evaluation, we make two errors: wrongly appending “ laughing out” to the previous token and wrongly normalising “lol” to just “loud” instead of “laughing out loud”. Since the model P1 did not come out best, we cannot reject Chrupała (2014)’s hypothesis that the noisy channel model would not be useful. However, our observations also do not provide much support for this hypothesis as we did not include standard models from previous work (Cook and Stevenson, 2009; Han et al., 2013) in our experiment. 5 Conclusions We trained two sequence modellers to predict edit operations that normalise input text when executed and experimented with applying the noisy channel model to selecting candidate normalisation strings. Future work should: • Train the CRF on the full training data, either using a more memory-friendly (but possibly slower)</context>
</contexts>
<marker>Chrupała, 2014</marker>
<rawString>Grzegorz Chrupała. 2014. Normalizing tweets with edit scripts and recurrent neural embeddings. In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 680–686, Baltimore, Maryland, June. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Michael Collins</author>
</authors>
<title>Discriminative training methods for hidden Markov models: Theory and experiments with perceptron algorithms.</title>
<date>2002</date>
<booktitle>In Proceedings of the 2002 Conference on Empirical Methods in Natural Language Processing (EMNLP’02),</booktitle>
<pages>1--8</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Morristown, NJ, USA,</location>
<contexts>
<context position="6581" citStr="Collins, 2002" startWordPosition="1045" endWordPosition="1046">or the training data, this is the same as characters as the training set does not contain any multi-byte UTF-8 characters. 5These RNN-LM hidden layer activation features have been used successfully in text segmentation and word-level language identification (Chrupała, 2013; Barman et al., 2014). rules, i.e. whether or not the characters encountered since the last space or start of tweet only are letters, digits, apostrophes and spaces. 2.3 Sequence Labelling For character-level sequence labelling, we try (a) Sequor6 (Chrupała and Klakow, 2010), an implementation of the generalised perceptron (Collins, 2002),7 with 10 iterations, and (b) Wapiti8 (Lavergne et al., 2010)’s implementation of conditional random fields (Lafferty et al., 2001) using l-bfgs optimisation with a history of 5 steps, elastic net regularisation (p1 = 0.333 and p2 = 0.001) and no hard limit on the number of iterations. We extend the feature templates of Chrupała (2014)9 by including our additional two features. The template generates unigram, bigram and trigram character features within a +/- 2 window. All remaining features are included as unigrams of the current value. Due to the nested cross-validation (see above), Sequor </context>
</contexts>
<marker>Collins, 2002</marker>
<rawString>Michael Collins. 2002. Discriminative training methods for hidden Markov models: Theory and experiments with perceptron algorithms. In Proceedings of the 2002 Conference on Empirical Methods in Natural Language Processing (EMNLP’02), pages 1–8, Morristown, NJ, USA, July. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Paul Cook</author>
<author>Suzanne Stevenson</author>
</authors>
<title>An unsupervised model for text message normalization.</title>
<date>2009</date>
<booktitle>In Proceedings of the Workshop on Computational Approaches to Linguistic Creativity,</booktitle>
<pages>71--78</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Boulder, Colorado,</location>
<contexts>
<context position="16936" citStr="Cook and Stevenson, 2009" startWordPosition="2808" endWordPosition="2811"> laughing out” before the space before “lol” and then replacing second “L” of “lol” with “ud”. This is not wrong on the string 96 level, but in the token-level evaluation, we make two errors: wrongly appending “ laughing out” to the previous token and wrongly normalising “lol” to just “loud” instead of “laughing out loud”. Since the model P1 did not come out best, we cannot reject Chrupała (2014)’s hypothesis that the noisy channel model would not be useful. However, our observations also do not provide much support for this hypothesis as we did not include standard models from previous work (Cook and Stevenson, 2009; Han et al., 2013) in our experiment. 5 Conclusions We trained two sequence modellers to predict edit operations that normalise input text when executed and experimented with applying the noisy channel model to selecting candidate normalisation strings. Future work should: • Train the CRF on the full training data, either using a more memory-friendly (but possibly slower) optimisation method or using an even larger machine. • Experiment with LSTM sequence modelling (Hochreiter and Schmidhuber, 1997; Gers, 2001), which has been applied successfully to speech recognition and caption generation </context>
</contexts>
<marker>Cook, Stevenson, 2009</marker>
<rawString>Paul Cook and Suzanne Stevenson. 2009. An unsupervised model for text message normalization. In Proceedings of the Workshop on Computational Approaches to Linguistic Creativity, pages 71–78, Boulder, Colorado, June. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Felix Gers</author>
</authors>
<title>Long Short-Term Memory in Recurrent Neural Networks.</title>
<date>2001</date>
<booktitle>Ph.D. thesis, École Polytechnique Fédérale de Lausanne, Département d’Informatique,</booktitle>
<location>Lausanne.</location>
<note>http:// www.felixgers.de/papers/phd.pdf.</note>
<contexts>
<context position="17453" citStr="Gers, 2001" startWordPosition="2890" endWordPosition="2891"> hypothesis as we did not include standard models from previous work (Cook and Stevenson, 2009; Han et al., 2013) in our experiment. 5 Conclusions We trained two sequence modellers to predict edit operations that normalise input text when executed and experimented with applying the noisy channel model to selecting candidate normalisation strings. Future work should: • Train the CRF on the full training data, either using a more memory-friendly (but possibly slower) optimisation method or using an even larger machine. • Experiment with LSTM sequence modelling (Hochreiter and Schmidhuber, 1997; Gers, 2001), which has been applied successfully to speech recognition and caption generation (Graves and Jaitly, 2014; Vinyals et al., 2015). • Combine models with voting rather than language model score. • For the noisy channel model, try standard models from previous work (Cook and Stevenson, 2009; Han et al., 2013). • To better understand the selection preferences of the noisy channel model, compare the F1-score obtained when evaluating against the gold data to the F1-score obtained when evaluating the system output against its own input, i.e. are we biased towards doing nothing? • Introduce a brevit</context>
</contexts>
<marker>Gers, 2001</marker>
<rawString>Felix Gers. 2001. Long Short-Term Memory in Recurrent Neural Networks. Ph.D. thesis, École Polytechnique Fédérale de Lausanne, Département d’Informatique, Lausanne, Switzerland. http:// www.felixgers.de/papers/phd.pdf.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alex Graves</author>
<author>Navdeep Jaitly</author>
</authors>
<title>Towards end-to-end speech recognition with recurrent neural networks.</title>
<date>2014</date>
<booktitle>Proceedings of The 31st International Conference on Machine Learning, volume 32 of JMLR Workshop and Conference Proceedings. http: //jmlr.org/proceedings/papers/v32/.</booktitle>
<editor>In Eric P. Xing and Tony Jebara, editors,</editor>
<contexts>
<context position="17560" citStr="Graves and Jaitly, 2014" startWordPosition="2904" endWordPosition="2907"> Han et al., 2013) in our experiment. 5 Conclusions We trained two sequence modellers to predict edit operations that normalise input text when executed and experimented with applying the noisy channel model to selecting candidate normalisation strings. Future work should: • Train the CRF on the full training data, either using a more memory-friendly (but possibly slower) optimisation method or using an even larger machine. • Experiment with LSTM sequence modelling (Hochreiter and Schmidhuber, 1997; Gers, 2001), which has been applied successfully to speech recognition and caption generation (Graves and Jaitly, 2014; Vinyals et al., 2015). • Combine models with voting rather than language model score. • For the noisy channel model, try standard models from previous work (Cook and Stevenson, 2009; Han et al., 2013). • To better understand the selection preferences of the noisy channel model, compare the F1-score obtained when evaluating against the gold data to the F1-score obtained when evaluating the system output against its own input, i.e. are we biased towards doing nothing? • Introduce a brevity penalty to counter the effect of selecting short candidate normalisations in the noisy channel model. • A</context>
</contexts>
<marker>Graves, Jaitly, 2014</marker>
<rawString>Alex Graves and Navdeep Jaitly. 2014. Towards end-to-end speech recognition with recurrent neural networks. In Eric P. Xing and Tony Jebara, editors, Proceedings of The 31st International Conference on Machine Learning, volume 32 of JMLR Workshop and Conference Proceedings. http: //jmlr.org/proceedings/papers/v32/.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Bo Han</author>
<author>Paul Cook</author>
<author>Timothy Baldwin</author>
</authors>
<title>Lexical normalization for social media text.</title>
<date>2013</date>
<journal>ACM Transactions on Intelligent Systems and Technology (TIST)</journal>
<volume>4</volume>
<issue>1</issue>
<pages>5:1--5:27</pages>
<contexts>
<context position="12439" citStr="Han et al. (2013)" startWordPosition="2044" endWordPosition="2047">ther 0.999 or between 1.001 and 0.999 + 0.001 x (28 − 2) = 1.253. P2 effectively excludes the original input and all candidates that use only some but not all of the edit operations suggested by the sequence labellers. Since there are five sequence labellers per cross-validation fold due to nested cross-validation and 25 sequence labellers during testing, P2 effectively selects between 5 or 25 candidates.13 12The noisy channel model has been applied successfully to spelling correction (Kemighan et al., 1990; WilcoxO’Hearn et al., 2008) and machine translation (Way, 2010), among other areas. 13Han et al. (2013) also use a trigram language model for normalisation, but only to reduce a larger candidate set to an P1(t s) = and 95 2 3 4 5 6 WB 14.70 9.97 7.91 7.31 7.19 KN 14.73 9.83 7.81 7.33 7.43 GT 14.63 9.88 7.91 7.45 7.44 Table 1: Average language model perplexity over the five cross-validation runs for n-gram sizes n = 2,..., 6 and smoothing methods WB = WittenBell, KN = Keyser-Ney and GT = Good-Turing. Standard deviation Q G 0.23 for all configurations. 2.8 Evaluation Measures We evaluate our best systems using the evalution script provided by the shared task organisers. It counts: • The number of</context>
<context position="16955" citStr="Han et al., 2013" startWordPosition="2812" endWordPosition="2815">space before “lol” and then replacing second “L” of “lol” with “ud”. This is not wrong on the string 96 level, but in the token-level evaluation, we make two errors: wrongly appending “ laughing out” to the previous token and wrongly normalising “lol” to just “loud” instead of “laughing out loud”. Since the model P1 did not come out best, we cannot reject Chrupała (2014)’s hypothesis that the noisy channel model would not be useful. However, our observations also do not provide much support for this hypothesis as we did not include standard models from previous work (Cook and Stevenson, 2009; Han et al., 2013) in our experiment. 5 Conclusions We trained two sequence modellers to predict edit operations that normalise input text when executed and experimented with applying the noisy channel model to selecting candidate normalisation strings. Future work should: • Train the CRF on the full training data, either using a more memory-friendly (but possibly slower) optimisation method or using an even larger machine. • Experiment with LSTM sequence modelling (Hochreiter and Schmidhuber, 1997; Gers, 2001), which has been applied successfully to speech recognition and caption generation (Graves and Jaitly,</context>
</contexts>
<marker>Han, Cook, Baldwin, 2013</marker>
<rawString>Bo Han, Paul Cook, and Timothy Baldwin. 2013. Lexical normalization for social media text. ACM Transactions on Intelligent Systems and Technology (TIST) - Special section on twitter and microblogging services, social recommender systems, and CAMRa2010: Movie recommendation in context archive, 4(1):5:1–5:27. doi:10.1145/2414425.2414430.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Sepp Hochreiter</author>
<author>Jürgen Schmidhuber</author>
</authors>
<title>Long short-term memory.</title>
<date>1997</date>
<journal>Neural Computation,</journal>
<volume>9</volume>
<issue>8</issue>
<pages>1735--1780</pages>
<contexts>
<context position="17440" citStr="Hochreiter and Schmidhuber, 1997" startWordPosition="2886" endWordPosition="2889"> not provide much support for this hypothesis as we did not include standard models from previous work (Cook and Stevenson, 2009; Han et al., 2013) in our experiment. 5 Conclusions We trained two sequence modellers to predict edit operations that normalise input text when executed and experimented with applying the noisy channel model to selecting candidate normalisation strings. Future work should: • Train the CRF on the full training data, either using a more memory-friendly (but possibly slower) optimisation method or using an even larger machine. • Experiment with LSTM sequence modelling (Hochreiter and Schmidhuber, 1997; Gers, 2001), which has been applied successfully to speech recognition and caption generation (Graves and Jaitly, 2014; Vinyals et al., 2015). • Combine models with voting rather than language model score. • For the noisy channel model, try standard models from previous work (Cook and Stevenson, 2009; Han et al., 2013). • To better understand the selection preferences of the noisy channel model, compare the F1-score obtained when evaluating against the gold data to the F1-score obtained when evaluating the system output against its own input, i.e. are we biased towards doing nothing? • Intro</context>
</contexts>
<marker>Hochreiter, Schmidhuber, 1997</marker>
<rawString>Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long short-term memory. Neural Computation, 9(8):1735–1780. doi:10.1162/neco.1997.9.8.1735.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Mark D. Kernighan</author>
<author>Kenneth W. Church</author>
<author>William A. Gale</author>
</authors>
<title>A spelling correction program based on a noisy channel model.</title>
<date>1990</date>
<booktitle>COLING-90: Papers presented to the 13th International Conference on Computational Linguistics on the occasion of the 25th Anniversary of COLING and the 350th Anniversary of Helsinki University</booktitle>
<volume>2</volume>
<editor>In Hans Karlgren, editor,</editor>
<contexts>
<context position="12334" citStr="Kemighan et al., 1990" startWordPosition="2027" endWordPosition="2030"> 2z −2 cases where i is the number of sections considered in candidate generation, causing the total to be either 0.999 or between 1.001 and 0.999 + 0.001 x (28 − 2) = 1.253. P2 effectively excludes the original input and all candidates that use only some but not all of the edit operations suggested by the sequence labellers. Since there are five sequence labellers per cross-validation fold due to nested cross-validation and 25 sequence labellers during testing, P2 effectively selects between 5 or 25 candidates.13 12The noisy channel model has been applied successfully to spelling correction (Kemighan et al., 1990; WilcoxO’Hearn et al., 2008) and machine translation (Way, 2010), among other areas. 13Han et al. (2013) also use a trigram language model for normalisation, but only to reduce a larger candidate set to an P1(t s) = and 95 2 3 4 5 6 WB 14.70 9.97 7.91 7.31 7.19 KN 14.73 9.83 7.81 7.33 7.43 GT 14.63 9.88 7.91 7.45 7.44 Table 1: Average language model perplexity over the five cross-validation runs for n-gram sizes n = 2,..., 6 and smoothing methods WB = WittenBell, KN = Keyser-Ney and GT = Good-Turing. Standard deviation Q G 0.23 for all configurations. 2.8 Evaluation Measures We evaluate our b</context>
</contexts>
<marker>Kernighan, Church, Gale, 1990</marker>
<rawString>Mark D. Kernighan, Kenneth W. Church, and William A. Gale. 1990. A spelling correction program based on a noisy channel model. In Hans Karlgren, editor, COLING-90: Papers presented to the 13th International Conference on Computational Linguistics on the occasion of the 25th Anniversary of COLING and the 350th Anniversary of Helsinki University, Volume 2. http://www.aclweb.org/anthology/C/C90/.</rawString>
</citation>
<citation valid="true">
<authors>
<author>John Lafferty</author>
<author>Andrew McCallum</author>
<author>Fernando C. N. Pereira</author>
</authors>
<title>Conditional random fields: Probabilistic models for segmenting and labeling sequence data.</title>
<date>2001</date>
<pages>282--289</pages>
<contexts>
<context position="6713" citStr="Lafferty et al., 2001" startWordPosition="1063" endWordPosition="1066">These RNN-LM hidden layer activation features have been used successfully in text segmentation and word-level language identification (Chrupała, 2013; Barman et al., 2014). rules, i.e. whether or not the characters encountered since the last space or start of tweet only are letters, digits, apostrophes and spaces. 2.3 Sequence Labelling For character-level sequence labelling, we try (a) Sequor6 (Chrupała and Klakow, 2010), an implementation of the generalised perceptron (Collins, 2002),7 with 10 iterations, and (b) Wapiti8 (Lavergne et al., 2010)’s implementation of conditional random fields (Lafferty et al., 2001) using l-bfgs optimisation with a history of 5 steps, elastic net regularisation (p1 = 0.333 and p2 = 0.001) and no hard limit on the number of iterations. We extend the feature templates of Chrupała (2014)9 by including our additional two features. The template generates unigram, bigram and trigram character features within a +/- 2 window. All remaining features are included as unigrams of the current value. Due to the nested cross-validation (see above), Sequor is trained on 64% (0.82) of the training data, 16% (0.8 x 0.2) is used as development set and 20% (1/5) for testing. For Wapiti, we </context>
</contexts>
<marker>Lafferty, McCallum, Pereira, 2001</marker>
<rawString>John Lafferty, Andrew McCallum, and Fernando CN Pereira. 2001. Conditional random fields: Probabilistic models for segmenting and labeling sequence data. pages 282–289.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Thomas Lavergne</author>
<author>Olivier Cappé</author>
<author>François Yvon</author>
</authors>
<title>Practical very large scale CRFs.</title>
<date>2010</date>
<booktitle>In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics,</booktitle>
<pages>504--513</pages>
<publisher>Association for Computational Linguistics.</publisher>
<location>Uppsala, Sweden,</location>
<marker>Lavergne, Cappé, Yvon, 2010</marker>
<rawString>Thomas Lavergne, Olivier Cappé, and François Yvon. 2010. Practical very large scale CRFs. In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics, pages 504–513, Uppsala, Sweden, July. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="false">
<authors>
<author>T. Mikolov</author>
<author>M. Karafiát</author>
<author>L. Burget</author>
<author>J. Černocký</author>
<author>S. Khudanpur</author>
</authors>
<title>Recurrent neural network based language model.</title>
<date>2010</date>
<booktitle>In Proceedings of the 11th Annual Conference of the International Speech Communication Association (INTERSPEECH 2010), Makuhari, Chiba, Japan. International Speech Communication Association (ISCA). http://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf.</booktitle>
<marker>Mikolov, Karafiát, Burget, Černocký, Khudanpur, 2010</marker>
<rawString>T. Mikolov, M. Karafiát, L. Burget, J. Černocký, and S. Khudanpur. 2010. Recurrent neural network based language model. In Proceedings of the 11th Annual Conference of the International Speech Communication Association (INTERSPEECH 2010), Makuhari, Chiba, Japan. International Speech Communication Association (ICSA). http://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Tomáš Mikolov</author>
</authors>
<title>Statistical Language Models based on Neural Networks.</title>
<date>2012</date>
<tech>Ph.D. thesis,</tech>
<institution>Brno University of Technology, Faculty of Information Technology, Department of Computer Graphics and Multimedia,</institution>
<location>Brno, Czech Republic.</location>
<note>http://www.fit.vutbr.cz/~imikolov/rnnlm/thesis.pdf.</note>
<contexts>
<context position="4202" citStr="Mikolov, 2012" startWordPosition="675" endWordPosition="676">ystems are trained on four sets and tested on the remain93 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 93–98, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics ing set. Since the sequence labellers require a development set, we split the union of the four sets again into 5 sets to carry out nested crossvalidation, training 25 models in total for each system. 2.2 Feature Extraction For extracting recurrent neural network language model features, we use Elman1 (Chrupała, 2014), a modification of the RNNLM toolkit2 (Mikolov et al., 2010; Mikolov, 2012) that outputs hidden layer activations. We use the off-the-shelf model from Chrupała (2014)3. The input are the characters of the tweet4 in one-hot encoding. The network has a hidden layer with 400 neurons and it predicts the next byte. Following Chrupała (2014), we reduce the 400 activations to 10 binary features: We select the 10 most active neurons in order and apply a threshold (0.5) to the activation. The value of the i-th feature expresses which neuron was i-th active and whether its activation was below 0.5, e.g. the first feature states which neuron is most active and whether or not it</context>
</contexts>
<marker>Mikolov, 2012</marker>
<rawString>Tomáš Mikolov. 2012. Statistical Language Models based on Neural Networks. Ph.D. thesis, Brno University of Technology, Faculty of Information Technology, Department of Computer Graphics and Multimedia, Brno, Czech Republic. http://www.fit.vutbr.cz/~imikolov/rnnlm/thesis.pdf.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Andreas Stolcke</author>
</authors>
<title>SRILM — an extensible language modeling toolkit.</title>
<date>2002</date>
<booktitle>Proceedings of the 7th International Conference on Spoken Language Processing (ICSLP2002),</booktitle>
<volume>2</volume>
<pages>901--904</pages>
<editor>In John H. L. Hansen and Bryan Pellom, editors,</editor>
<publisher>International Speech Communication Association (ISCA).</publisher>
<location>Baixas, France.</location>
<contexts>
<context position="10685" citStr="Stolcke, 2002" startWordPosition="1741" endWordPosition="1743">nput tokens. 2. Apply insert operations recorded at the space between tokens and at the end of the tweet to the preceding token. 3. Apply delete operations at the space between tokens, moving the contents of the token to the right to the end of the token to the left, leaving behind an empty token. (Delete operations at the end-of-tweet marker are ignored.) Due to time constraints, we do not attempt to improve the alignment of output tokens to input tokens. 11Splitting the eight sections again would produce 216 = 65,536 candidates. 2.6 Language Modelling For language modelling, we train SRILM (Stolcke, 2002) on the normalised tweets of the training data. As we want to build character n-gram models and SRILM has no direct support for this, we re-format the candidate strings to make each character a token. To distinguish space characters from token separators, we represent them with double underscores. 2.7 Candidate Selection We use the noisy channel model12 to select the most plausible source sˆ for the observed target t from the set of candidates S(t): arg max P(t s)P(s) (2) s∈S(t) P(s) is provided by the language model (Section 2.6). Standard models give high probability to making few or no edit</context>
</contexts>
<marker>Stolcke, 2002</marker>
<rawString>Andreas Stolcke. 2002. SRILM — an extensible language modeling toolkit. In John H. L. Hansen and Bryan Pellom, editors, Proceedings of the 7th International Conference on Spoken Language Processing (ICSLP2002), volume 2, pages 901–904, Baixas, France. International Speech Communication Association (ISCA).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Oriol Vinyals</author>
<author>Alexander Toshev</author>
<author>Samy Bengio</author>
<author>Dumitru Erhan</author>
</authors>
<title>Show and tell: A neural image caption generator.</title>
<date>2015</date>
<note>http://arxiv.org/abs/1411.4555, to appear in Computer Vision and Pattern Recognition.</note>
<contexts>
<context position="17583" citStr="Vinyals et al., 2015" startWordPosition="2908" endWordPosition="2911"> experiment. 5 Conclusions We trained two sequence modellers to predict edit operations that normalise input text when executed and experimented with applying the noisy channel model to selecting candidate normalisation strings. Future work should: • Train the CRF on the full training data, either using a more memory-friendly (but possibly slower) optimisation method or using an even larger machine. • Experiment with LSTM sequence modelling (Hochreiter and Schmidhuber, 1997; Gers, 2001), which has been applied successfully to speech recognition and caption generation (Graves and Jaitly, 2014; Vinyals et al., 2015). • Combine models with voting rather than language model score. • For the noisy channel model, try standard models from previous work (Cook and Stevenson, 2009; Han et al., 2013). • To better understand the selection preferences of the noisy channel model, compare the F1-score obtained when evaluating against the gold data to the F1-score obtained when evaluating the system output against its own input, i.e. are we biased towards doing nothing? • Introduce a brevity penalty to counter the effect of selecting short candidate normalisations in the noisy channel model. • Automatically revise the</context>
</contexts>
<marker>Vinyals, Toshev, Bengio, Erhan, 2015</marker>
<rawString>Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. 2015. Show and tell: A neural image caption generator. http://arxiv.org/abs/1411.4555, to appear in Computer Vision and Pattern Recognition.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Andy Way</author>
</authors>
<title>Machine translation.</title>
<date>2010</date>
<booktitle>The Handbook of Computational Linguistics and Natural Language Processing,</booktitle>
<pages>531--573</pages>
<editor>In Alexander Clark, Chris Fox, and Shalom Lappin, editors,</editor>
<publisher>Wiley Blackwell,</publisher>
<location>Chichester, UK,</location>
<contexts>
<context position="12399" citStr="Way, 2010" startWordPosition="2039" endWordPosition="2040">ation, causing the total to be either 0.999 or between 1.001 and 0.999 + 0.001 x (28 − 2) = 1.253. P2 effectively excludes the original input and all candidates that use only some but not all of the edit operations suggested by the sequence labellers. Since there are five sequence labellers per cross-validation fold due to nested cross-validation and 25 sequence labellers during testing, P2 effectively selects between 5 or 25 candidates.13 12The noisy channel model has been applied successfully to spelling correction (Kemighan et al., 1990; WilcoxO’Hearn et al., 2008) and machine translation (Way, 2010), among other areas. 13Han et al. (2013) also use a trigram language model for normalisation, but only to reduce a larger candidate set to an P1(t s) = and 95 2 3 4 5 6 WB 14.70 9.97 7.91 7.31 7.19 KN 14.73 9.83 7.81 7.33 7.43 GT 14.63 9.88 7.91 7.45 7.44 Table 1: Average language model perplexity over the five cross-validation runs for n-gram sizes n = 2,..., 6 and smoothing methods WB = WittenBell, KN = Keyser-Ney and GT = Good-Turing. Standard deviation Q G 0.23 for all configurations. 2.8 Evaluation Measures We evaluate our best systems using the evalution script provided by the shared tas</context>
</contexts>
<marker>Way, 2010</marker>
<rawString>Andy Way. 2010. Machine translation. In Alexander Clark, Chris Fox, and Shalom Lappin, editors, The Handbook of Computational Linguistics and Natural Language Processing, pages 531–573. Wiley Blackwell, Chichester, UK, July.</rawString>
</citation>
<citation valid="false">
<authors>
<author>Amber Wilcox-O’Hearn</author>
<author>Graeme Hirst</author>
<author>Alexander Budanitsky</author>
</authors>
<title>Real-word spelling correction with trigrams: A reconsideration of the Mays, Damerau, and Mercer model.</title>
<date>2008</date>
<booktitle>Computational Linguistics and Intelligent Text Processing - 9th International Conference, CICLing 2008,</booktitle>
<volume>4919</volume>
<pages>605--616</pages>
<editor>In Alexander Gelbukh, editor,</editor>
<publisher>Springer Berlin/Heidelberg, Germany.</publisher>
<location>Haifa, Israel,</location>
<note>2006 draft version available on http://ftp.cs.toronto.edu/pub/gh/WilcoxOHearnetal-2006.pdf.</note>
<marker>Wilcox-O’Hearn, Hirst, Budanitsky, 2008</marker>
<rawString>Amber Wilcox-O’Hearn, Graeme Hirst, and Alexander Budanitsky. 2008. Real-word spelling correction with trigrams: A reconsideration of the Mays, Damerau, and Mercer model. In Alexander Gelbukh, editor, Computational Linguistics and Intelligent Text Processing - 9th International Conference, CICLing 2008, Haifa, Israel, February 17–23, 2008 - Proceedings, volume 4919/2008, pages 605–616. Springer Berlin/Heidelberg, Germany. 2006 draft version available on http://ftp.cs.toronto.edu/pub/gh/WilcoxOHearnetal-2006.pdf.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Germany</author>
</authors>
<title>draft version available on http://ftp.cs.toronto.edu/pub/gh/WilcoxOHearnetal-2006.pdf.</title>
<date>2006</date>
<marker>Germany, 2006</marker>
<rawString>Germany. 2006 draft version available on http://ftp.cs.toronto.edu/pub/gh/WilcoxOHearnetal-2006.pdf.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Yue Zhang</author>
<author>Stephen Clark</author>
</authors>
<title>Syntactic processing using the generalized perceptron and beam search.</title>
<date>2011</date>
<journal>Computational Linguistics,</journal>
<volume>37</volume>
<issue>1</issue>
<pages>105--151</pages>
<contexts>
<context position="8267" citStr="Zhang and Clark, 2011" startWordPosition="1323" endWordPosition="1326">, we would produce up to 2N candidates, where N is the number of edit operations. With N = 140 (maximum lengths of a tweet), handling these many candidates is not feasible. Instead, we recursively split the sequence of edit operations produced by the sequence labeller into up to eight sections. To find good split points, we propose to minimise 11 |eL − eR |+ max({0,10 − s})/2 (1) 6https://bitbucket.org/gchrupala/ sequor 7The generalised perceptron has been shown to match performance of state-of-the-art methods in word segmentation, POS tagging, dependency parsing and phrase-structure parsing (Zhang and Clark, 2011). 8https://wapiti.limsi.fr/ 9We thank Grzegorz Chrupała for providing his template and for translating it to the Sequor template format. 10With 64%, memory usage grew to over 400 GB over night, causing heavy swap activity on our machines with 256 GB RAM (and 410 GB swap space). 94 where eL and eR are the number of insert or delete operations to the left and right respectively, and s is the number of consecutive no-operations to the left. The first term tries to balance the number of edit operations on each side while the second term introduces a preference to not split clusters of edit operati</context>
</contexts>
<marker>Zhang, Clark, 2011</marker>
<rawString>Yue Zhang and Stephen Clark. 2011. Syntactic processing using the generalized perceptron and beam search. Computational Linguistics, 37(1):105–151.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>