<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.027165">
<title confidence="0.9821475">
Lattice: Data Adaptation for Named Entity Recognition on Tweets with
Features-Rich CRF
</title>
<note confidence="0.759236">
Tian TIAN Marco Dinarelli Isabelle TELLIER
Lattice / 1 Maurice Arnoux Lattice / 1 Maurice Arnoux Lattice / 1 Maurice Arnoux
92120 MONTROUGE 92120 MONTROUGE 92120 MONTROUGE
</note>
<email confidence="0.975326">
tian.tian@live.cn marco.dinarelli@ens.fr isabelle.tellier@univ-paris3.fr
</email>
<sectionHeader confidence="0.993244" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999959153846154">
This article describes our CRF named en-
tity extractor for Twitter data. We first dis-
cuss some specificities of the task, with an
example found in the training data. Then
we present how we built our CRF model,
especially the way features were defined.
The results of these first experiments are
given. We also tested our model with
dev 2015 data and we describe the pro-
cedure we have used to adapt older Twit-
ter data to the data available for this 2015
shared task. Our final results for the task
are discussed.
</bodyText>
<sectionHeader confidence="0.998801" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999977066666667">
In this shared task, we have to extract 10 types of
(or not typed) named entities in Twitter data. We
have at our disposal two labelled corpora: train
and dev. The first section shows some specifici-
ties of the data, from an example it contains. We
then construct a CRF model for the task, using the
software Wapiti. Our features for this CRF are
chosen according to the state-of-the-art, they are
described in the second section. The third section
focuses on some experiments with train and dev
and gives the obtained results. The fourth section
is about the procedure we have used to build our fi-
nal model, by applying a domain adaptation strat-
egy. In the last section, we discuss some future
work for this shared task.
</bodyText>
<sectionHeader confidence="0.987517" genericHeader="method">
2 Data Analysis
</sectionHeader>
<bodyText confidence="0.999848833333333">
Although named entity recognition is a traditional
task of natural language processing (NLP) which
has given rise to a large body of works for writ-
ten English (Finkel et al., 2005) or news wires in
French (Stern and Sagot, 2010), the same task with
Twitter data remains difficult (Ritter et al., 2011).
</bodyText>
<figure confidence="0.329261">
Today wasz Fun cusz anna Came juss for me &lt;3: hahaha
</figure>
<figureCaption confidence="0.998895">
Figure 1: An example of tweet
</figureCaption>
<bodyText confidence="0.997502">
This is not only because of the task itself, but also
because of the way tweets are written.
Figure 1 shows an example of tweet. The cor-
rect sentence should be: Today was fun because
Anna came just for me &lt;3: hahaha. We can note
the following phenomena:
</bodyText>
<listItem confidence="0.999787">
• spelling mistakes: wasz (was), cusz (be-
cause), juss (just)
• confusion of upper/lower cases: Fun (fun),
anna (Anna), Came (came)
• emoticon: &lt;3
• interjection: hahaha
</listItem>
<bodyText confidence="0.9994688">
We remark here that the only name has no up-
per case letters whereas other words have upper
cases (like ”Fun”, ”Came”). So, it would be diffi-
cult for a named entity extractor to correctly detect
this person name.
</bodyText>
<sectionHeader confidence="0.958908" genericHeader="method">
3 CRF Implementation and Features
</sectionHeader>
<subsectionHeader confidence="0.997385">
3.1 CRF Features
</subsectionHeader>
<bodyText confidence="0.999493909090909">
We used the CRF implementation Wapiti 1.5.0 1
to create our CRF model. The optimization al-
gorithm we chose was rprop+. The features for
the tokens are all in unigrams and within a win-
dow of size 3 (previous token, current token and
next token). The bigrams are only made of labels,
characterizing label transitions. Table 1 shows
the features we implemented. These templates
have been chosen following (Suzuki and Isozaki,
2008), (Lavergne et al., 2010), (Nooralahzadeh et
al., 2014) and (Constant et al., 2011)
</bodyText>
<footnote confidence="0.99488">
1https://wapiti.limsi.fr
</footnote>
<page confidence="0.987891">
68
</page>
<note confidence="0.694716">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 68–71,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.970409485714286">
token value
fstUpper
shortCap
longCap
mixCap
hasUpper
allUpper
capType: combination of 6 binary values
allLetter
singleLetter
tokenType: punctuation, 9, x or X
hasNumber
allNumber
isDecimal
onePunct
allPunct
hasPunct
longPunct
hasQuotation
hasAtLeast2periodes
finishedByPeriode
hasDash
lower
returnUnicodeVector
isEmal
isURL
isRT
isUSR
isHashTag
isDate
isTime
isAbbrev
prefixe n, suffixe n (n = 1..5)
postag in PTB: with binary values
category in Brown cluster: in binary tree
</bodyText>
<tableCaption confidence="0.941061">
Table 1: CRF features
</tableCaption>
<bodyText confidence="0.999917384615385">
The capType features regroup 6 binary features:
allUpper, shortCap, longCap, allLower, fstUpper,
mixCap. The tokenType feature transforms a to-
ken into a ”skeleton”: in this skeleton, all num-
bers are replaced by 9, all letters in lower case by
x, all letters in upper case by X and the punctua-
tions remain unchanged. The part-of-speech tags
(postags) of the Penn Tree Bank (PTB) (Marcus et
al., 1993) generate 45 distinct features. Each tag
in the PTB becomes a feature with a binary value.
The ”category in Brown cluster” uses the result of
Brown clustering (Brown et al., 1992) executed
with 56,345,753 tweets available at http://
</bodyText>
<table confidence="0.999327">
precision recall FB1
dev 69.01% 33.15% 44.78%
dev 2015 43.26% 22.43% 29.54%
</table>
<tableCaption confidence="0.9592885">
Table 2: Experiment results with model trained on
train file
</tableCaption>
<bodyText confidence="0.995487272727273">
www.ark.cs.cmu.edu/TweetNLP/. The
class of each token is represented with 13 binary
values. These values represent therefore a binary
tree. Each value means one level in the binary
tree. So we took the first value for each token,
i.e. its category with only one level (two possible
values). We then took the first two values of each
token, resulting in the clustering of twitter tokens
into four classes, etc. We took until all 13 values,
to get the classes of the token at every level of the
binary tree.
</bodyText>
<subsectionHeader confidence="0.999681">
3.2 Use of Lexical Resources
</subsectionHeader>
<bodyText confidence="0.999989047619047">
As they were attached with the available base-
line, we processed a set of entity dictionaries. We
tried to associate these dictionaries with the 10
types of entities defined for the shared task. We
deleted duplicated data (as we kept only cap.1000
but not cap.10 nor others, etc). Then we read
every item of the lists. As some items (enti-
ties) contain more than one token, we extracted
the first tokens (or the only token for one-token-
entities) and the remaining ones before storing
them into different lists. So, for every dictio-
nary we had, we created 2 lists: a ”B-dictionary”
and a ”I-dictionary”, preparing the BIO label-
ings. Finally, we integrated these dictionaries into
the model by binary values. For each token, if
it is present in a dictionary (B-dictionary or I-
dictionary), its value for the corresponding fea-
ture is set to 1, and 0 otherwise. And we could
always try with other resources like FreeBase
https://www.freebase.com/ and dbpedia
http://dbpedia.org/.
</bodyText>
<sectionHeader confidence="0.92754" genericHeader="method">
4 Some Experiments and Results
</sectionHeader>
<bodyText confidence="0.999846142857143">
With the templates defined in the previous sec-
tion, we used rprop+ as optimization algorithm
in Wapiti and we did some experiments (only
with the 10 distinct types of entities) with mod-
els trained with ”train” and tested on ”dev”, and
later tested on ”dev 2015”. Table 2 shows some of
these results.
</bodyText>
<page confidence="0.999436">
69
</page>
<sectionHeader confidence="0.988721" genericHeader="method">
5 CRF Model Training with Domain
</sectionHeader>
<subsectionHeader confidence="0.826951">
Data Adaptation
</subsectionHeader>
<bodyText confidence="0.999916">
As we can see in the previous section, our first
model performs poorly on dev 2015 data com-
pared to dev. This suggests that the data in
dev 2015 are very different from the data in dev
and train. This intuition has indeed been con-
firmed by a quick data analysis.
As a consequence, we had the idea to perform
a kind of domain data adaptation, inspired by the
work of (Raymond and Fayolle, 2010). In this
context, the data we want to adapt is called source
domain. In our case, train and dev data play the
role of this source domain. The role of target do-
main is played by the new version of tweet data
provided for the shared task, that is dev 2015 data.
The approach described in (Raymond and Fayolle,
2010) mixes together data from the source domain
and from the target domain in order to train a CRF
model. The originality of this approach consists
in using more CRF features for the part of the data
constituting the target domain than features for the
data constituting the source domain. The conse-
quence of this choice is that the CRF models learn
word-label dependencies from both domains, but
put much stronger importance (feature scores) on
features in the target domain, since they are de-
scribed by more information (features).
We annotated afterwards the training data,
which we have already seen during the training
phase, with such a model. If the model can ap-
ply stronger dependencies learned from the target-
domain part of the training data, it will apply such
dependencies performing thus the desired adap-
tation. Otherwise it will apply the dependencies
learned from the source-domain part of the train-
ing data, thus keeping the old annotation.
We only applied an approximation of this do-
main adaptation procedure of (Raymond and Fay-
olle, 2010), because of a serious lack of time. In
order to create our final model, we trained our first
CRF model (with the templates mentioned in the
previous section) with dev 2015. We then applied
this first CRF model to train and dev to obtain
train crf and dev crf. So, these data are labelled
with our first CRF model. We got rid of the orig-
inal labels for train and dev. And, in the end, we
trained our final model (always with the same tem-
plates) with dev 2015, train crf and dev crf all to-
gether. We did the same procedure for the 10 types
of entities and for no typed data. Our results are de-
</bodyText>
<table confidence="0.996815666666667">
precision recall FB1
10 types 55.17% 9.68% 16.47%
no type 58.42% 25.72% 35.71%
</table>
<tableCaption confidence="0.953701">
Table 3: Results with model trained on dev 2015
then applied to train and dev files
</tableCaption>
<bodyText confidence="0.966409416666667">
scribed in Table 3.
Compared to results with dev 2015, we had a
better precision, which confirms that the adapta-
tion was worth doing. However we also had a
much worse recall, which could be someway pre-
dicted since the dev 2015 data is much smaller
than the training data. It thus creates a serious low
covering problem. Such problem can be overcome
by applying the exact adaptation procedure de-
scribed in (Raymond and Fayolle, 2010), together
with the use of more external resources (such as
name lists).
</bodyText>
<sectionHeader confidence="0.999847" genericHeader="conclusions">
6 Future work
</sectionHeader>
<bodyText confidence="0.999949555555556">
In the future, we could do some proper experi-
ments in cross validation with the training data,
in order to find better templates, and find the best
L1 and L2 regularization parameters of the CRF.
We believe that correctly performing the adapta-
tion procedure of (Raymond and Fayolle, 2010)
and thus obtaining a better CRF model for our
named entity extractor would lead to much better
results.
</bodyText>
<sectionHeader confidence="0.998577" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.99965805">
Peter F. Brown, Peter V. deSouza, Robert L. Mer-
cer, Vincent J. Della Pietra, and Jenifer C. Lai.
1992. Class-based n-gram models of natural lan-
guage. Comput. Linguist., 18(4):467–479, Decem-
ber.
Matthieu Constant, Isabelle Tellier, Denys Duchier,
Yoann Dupont, Anthony Sigogne, and Sylvie Bil-
lot. 2011. Intégrer des connaissances linguistiques
dans un CRF : application à l’apprentissage d’un
segmenteur-étiqueteur du français. In TALN, vol-
ume 1, page 321, Montpellier, France, June.
Jenny R. Finkel, Trond Grenager, and Christopher
Manning. 2005. Incorporating non-local informa-
tion into information extraction systems by Gibbs
sampling. In Proceedings of the 43rd Annual Meet-
ing on Association for Computational Linguistics,
ACL ’05, pages 363–370, Stroudsburg, PA, USA.
Association for Computational Linguistics.
Thomas Lavergne, Olivier Cappé, and François Yvon.
2010. Practical very large scale crfs. In Proceed-
</reference>
<page confidence="0.967769">
70
</page>
<reference confidence="0.998485375">
ings of the 48th Annual Meeting of the Association
for Computational Linguistics, ACL ’10, pages 504–
513, Stroudsburg, PA, USA. Association for Com-
putational Linguistics.
Mitchell P. Marcus, Beatrice Santorini, and Mary Ann
Marcinkiewicz. 1993. Building a large annotated
corpus of english: The penn treebank. COMPUTA-
TIONAL LINGUISTICS, 19(2):313–330.
Farhad Nooralahzadeh, Caroline Brun, and Claude
Roux. 2014. Part of speech tagging for french social
media data. In COLING 2014, 25th International
Conference on Computational Linguistics, Proceed-
ings of the Conference: Technical Papers, August
23-29, 2014, Dublin, Ireland, pages 1764–1772.
Christian Raymond and Julien Fayolle. 2010. Recon-
naissance robuste d’entités nommées sur de la parole
transcrite automatiquement. In Conférence Traite-
ment automatique des langues naturelles, TALN’10,
Montréal, Québec, Canada, July. ATALA.
Alan Ritter, Sam Clark, Mausam, and Oren Etzioni.
2011. Named entity recognition in tweets: An ex-
perimental study. In Proceedings of the Conference
on Empirical Methods in Natural Language Pro-
cessing, EMNLP ’11, pages 1524–1534, Strouds-
burg, PA, USA. Association for Computational Lin-
guistics.
Rosa Stern and Benoit Sagot. 2010. Resources for
named entity recognition and resolution in news
wires. In Entity 2010 Workshop at LREC 2010.
Jun Suzuki and Hideki Isozaki. 2008. Semi-supervised
sequential labeling and segmentation using giga-
word scale unlabeled data. In In ACL.
</reference>
<page confidence="0.99913">
71
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.420368">
<title confidence="0.989797">Lattice: Data Adaptation for Named Entity Recognition on Tweets with Features-Rich CRF</title>
<author confidence="0.728763">Tian TIAN Marco Dinarelli Isabelle TELLIER Lattice Maurice Arnoux Lattice Maurice Arnoux Lattice Maurice Arnoux</author>
<address confidence="0.986934">92120 MONTROUGE 92120 MONTROUGE 92120 MONTROUGE</address>
<email confidence="0.896422">tian.tian@live.cnmarco.dinarelli@ens.frisabelle.tellier@univ-paris3.fr</email>
<abstract confidence="0.999276857142857">This article describes our CRF named entity extractor for Twitter data. We first discuss some specificities of the task, with an example found in the training data. Then we present how we built our CRF model, especially the way features were defined. The results of these first experiments are given. We also tested our model with dev 2015 data and we describe the procedure we have used to adapt older Twitter data to the data available for this 2015 shared task. Our final results for the task are discussed.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Peter F Brown</author>
<author>Peter V deSouza</author>
<author>Robert L Mercer</author>
<author>Vincent J Della Pietra</author>
<author>Jenifer C Lai</author>
</authors>
<title>Class-based n-gram models of natural language.</title>
<date>1992</date>
<journal>Comput. Linguist.,</journal>
<volume>18</volume>
<issue>4</issue>
<contexts>
<context position="4453" citStr="Brown et al., 1992" startWordPosition="730" endWordPosition="733">er: in binary tree Table 1: CRF features The capType features regroup 6 binary features: allUpper, shortCap, longCap, allLower, fstUpper, mixCap. The tokenType feature transforms a token into a ”skeleton”: in this skeleton, all numbers are replaced by 9, all letters in lower case by x, all letters in upper case by X and the punctuations remain unchanged. The part-of-speech tags (postags) of the Penn Tree Bank (PTB) (Marcus et al., 1993) generate 45 distinct features. Each tag in the PTB becomes a feature with a binary value. The ”category in Brown cluster” uses the result of Brown clustering (Brown et al., 1992) executed with 56,345,753 tweets available at http:// precision recall FB1 dev 69.01% 33.15% 44.78% dev 2015 43.26% 22.43% 29.54% Table 2: Experiment results with model trained on train file www.ark.cs.cmu.edu/TweetNLP/. The class of each token is represented with 13 binary values. These values represent therefore a binary tree. Each value means one level in the binary tree. So we took the first value for each token, i.e. its category with only one level (two possible values).We then took the first two values of each token, resulting in the clustering of twitter tokens into four classes, etc. </context>
</contexts>
<marker>Brown, deSouza, Mercer, Pietra, Lai, 1992</marker>
<rawString>Peter F. Brown, Peter V. deSouza, Robert L. Mercer, Vincent J. Della Pietra, and Jenifer C. Lai. 1992. Class-based n-gram models of natural language. Comput. Linguist., 18(4):467–479, December.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Matthieu Constant</author>
<author>Isabelle Tellier</author>
<author>Denys Duchier</author>
<author>Yoann Dupont</author>
<author>Anthony Sigogne</author>
<author>Sylvie Billot</author>
</authors>
<title>Intégrer des connaissances linguistiques dans un CRF : application à l’apprentissage d’un segmenteur-étiqueteur du français.</title>
<date>2011</date>
<booktitle>In TALN,</booktitle>
<volume>1</volume>
<pages>321</pages>
<location>Montpellier, France,</location>
<contexts>
<context position="3187" citStr="Constant et al., 2011" startWordPosition="541" endWordPosition="544">amed entity extractor to correctly detect this person name. 3 CRF Implementation and Features 3.1 CRF Features We used the CRF implementation Wapiti 1.5.0 1 to create our CRF model. The optimization algorithm we chose was rprop+. The features for the tokens are all in unigrams and within a window of size 3 (previous token, current token and next token). The bigrams are only made of labels, characterizing label transitions. Table 1 shows the features we implemented. These templates have been chosen following (Suzuki and Isozaki, 2008), (Lavergne et al., 2010), (Nooralahzadeh et al., 2014) and (Constant et al., 2011) 1https://wapiti.limsi.fr 68 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 68–71, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics token value fstUpper shortCap longCap mixCap hasUpper allUpper capType: combination of 6 binary values allLetter singleLetter tokenType: punctuation, 9, x or X hasNumber allNumber isDecimal onePunct allPunct hasPunct longPunct hasQuotation hasAtLeast2periodes finishedByPeriode hasDash lower returnUnicodeVector isEmal isURL isRT isUSR isHashTag isDate isTime isAbbrev prefixe n, suffixe n (n = 1..5) postag in </context>
</contexts>
<marker>Constant, Tellier, Duchier, Dupont, Sigogne, Billot, 2011</marker>
<rawString>Matthieu Constant, Isabelle Tellier, Denys Duchier, Yoann Dupont, Anthony Sigogne, and Sylvie Billot. 2011. Intégrer des connaissances linguistiques dans un CRF : application à l’apprentissage d’un segmenteur-étiqueteur du français. In TALN, volume 1, page 321, Montpellier, France, June.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jenny R Finkel</author>
<author>Trond Grenager</author>
<author>Christopher Manning</author>
</authors>
<title>Incorporating non-local information into information extraction systems by Gibbs sampling.</title>
<date>2005</date>
<booktitle>In Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics, ACL ’05,</booktitle>
<pages>363--370</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="1784" citStr="Finkel et al., 2005" startWordPosition="296" endWordPosition="299">e software Wapiti. Our features for this CRF are chosen according to the state-of-the-art, they are described in the second section. The third section focuses on some experiments with train and dev and gives the obtained results. The fourth section is about the procedure we have used to build our final model, by applying a domain adaptation strategy. In the last section, we discuss some future work for this shared task. 2 Data Analysis Although named entity recognition is a traditional task of natural language processing (NLP) which has given rise to a large body of works for written English (Finkel et al., 2005) or news wires in French (Stern and Sagot, 2010), the same task with Twitter data remains difficult (Ritter et al., 2011). Today wasz Fun cusz anna Came juss for me &lt;3: hahaha Figure 1: An example of tweet This is not only because of the task itself, but also because of the way tweets are written. Figure 1 shows an example of tweet. The correct sentence should be: Today was fun because Anna came just for me &lt;3: hahaha. We can note the following phenomena: • spelling mistakes: wasz (was), cusz (because), juss (just) • confusion of upper/lower cases: Fun (fun), anna (Anna), Came (came) • emotico</context>
</contexts>
<marker>Finkel, Grenager, Manning, 2005</marker>
<rawString>Jenny R. Finkel, Trond Grenager, and Christopher Manning. 2005. Incorporating non-local information into information extraction systems by Gibbs sampling. In Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics, ACL ’05, pages 363–370, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Thomas Lavergne</author>
<author>Olivier Cappé</author>
<author>François Yvon</author>
</authors>
<title>Practical very large scale crfs.</title>
<date>2010</date>
<booktitle>In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics, ACL ’10,</booktitle>
<pages>504--513</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA.</location>
<marker>Lavergne, Cappé, Yvon, 2010</marker>
<rawString>Thomas Lavergne, Olivier Cappé, and François Yvon. 2010. Practical very large scale crfs. In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics, ACL ’10, pages 504– 513, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Mitchell P Marcus</author>
<author>Beatrice Santorini</author>
<author>Mary Ann Marcinkiewicz</author>
</authors>
<title>Building a large annotated corpus of english: The penn treebank.</title>
<date>1993</date>
<journal>COMPUTATIONAL LINGUISTICS,</journal>
<volume>19</volume>
<issue>2</issue>
<contexts>
<context position="4274" citStr="Marcus et al., 1993" startWordPosition="699" endWordPosition="702">hasDash lower returnUnicodeVector isEmal isURL isRT isUSR isHashTag isDate isTime isAbbrev prefixe n, suffixe n (n = 1..5) postag in PTB: with binary values category in Brown cluster: in binary tree Table 1: CRF features The capType features regroup 6 binary features: allUpper, shortCap, longCap, allLower, fstUpper, mixCap. The tokenType feature transforms a token into a ”skeleton”: in this skeleton, all numbers are replaced by 9, all letters in lower case by x, all letters in upper case by X and the punctuations remain unchanged. The part-of-speech tags (postags) of the Penn Tree Bank (PTB) (Marcus et al., 1993) generate 45 distinct features. Each tag in the PTB becomes a feature with a binary value. The ”category in Brown cluster” uses the result of Brown clustering (Brown et al., 1992) executed with 56,345,753 tweets available at http:// precision recall FB1 dev 69.01% 33.15% 44.78% dev 2015 43.26% 22.43% 29.54% Table 2: Experiment results with model trained on train file www.ark.cs.cmu.edu/TweetNLP/. The class of each token is represented with 13 binary values. These values represent therefore a binary tree. Each value means one level in the binary tree. So we took the first value for each token, </context>
</contexts>
<marker>Marcus, Santorini, Marcinkiewicz, 1993</marker>
<rawString>Mitchell P. Marcus, Beatrice Santorini, and Mary Ann Marcinkiewicz. 1993. Building a large annotated corpus of english: The penn treebank. COMPUTATIONAL LINGUISTICS, 19(2):313–330.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Farhad Nooralahzadeh</author>
<author>Caroline Brun</author>
<author>Claude Roux</author>
</authors>
<title>Part of speech tagging for french social media data.</title>
<date>2014</date>
<booktitle>In COLING 2014, 25th International Conference on Computational Linguistics, Proceedings of the Conference: Technical Papers,</booktitle>
<pages>1764--1772</pages>
<location>Dublin, Ireland,</location>
<contexts>
<context position="3159" citStr="Nooralahzadeh et al., 2014" startWordPosition="536" endWordPosition="539">So, it would be difficult for a named entity extractor to correctly detect this person name. 3 CRF Implementation and Features 3.1 CRF Features We used the CRF implementation Wapiti 1.5.0 1 to create our CRF model. The optimization algorithm we chose was rprop+. The features for the tokens are all in unigrams and within a window of size 3 (previous token, current token and next token). The bigrams are only made of labels, characterizing label transitions. Table 1 shows the features we implemented. These templates have been chosen following (Suzuki and Isozaki, 2008), (Lavergne et al., 2010), (Nooralahzadeh et al., 2014) and (Constant et al., 2011) 1https://wapiti.limsi.fr 68 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 68–71, Beijing, China, July 31, 2015. c�2015 Association for Computational Linguistics token value fstUpper shortCap longCap mixCap hasUpper allUpper capType: combination of 6 binary values allLetter singleLetter tokenType: punctuation, 9, x or X hasNumber allNumber isDecimal onePunct allPunct hasPunct longPunct hasQuotation hasAtLeast2periodes finishedByPeriode hasDash lower returnUnicodeVector isEmal isURL isRT isUSR isHashTag isDate isTime isAbbrev prefixe n, suf</context>
</contexts>
<marker>Nooralahzadeh, Brun, Roux, 2014</marker>
<rawString>Farhad Nooralahzadeh, Caroline Brun, and Claude Roux. 2014. Part of speech tagging for french social media data. In COLING 2014, 25th International Conference on Computational Linguistics, Proceedings of the Conference: Technical Papers, August 23-29, 2014, Dublin, Ireland, pages 1764–1772.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Christian Raymond</author>
<author>Julien Fayolle</author>
</authors>
<title>Reconnaissance robuste d’entités nommées sur de la parole transcrite automatiquement.</title>
<date>2010</date>
<booktitle>In Conférence Traitement automatique des langues naturelles, TALN’10,</booktitle>
<publisher>ATALA.</publisher>
<location>Montréal, Québec, Canada,</location>
<contexts>
<context position="6928" citStr="Raymond and Fayolle, 2010" startWordPosition="1155" endWordPosition="1158">experiments (only with the 10 distinct types of entities) with models trained with ”train” and tested on ”dev”, and later tested on ”dev 2015”. Table 2 shows some of these results. 69 5 CRF Model Training with Domain Data Adaptation As we can see in the previous section, our first model performs poorly on dev 2105 data compared to dev. This suggests that the data in dev 2015 are very different from the data in dev and train. This intuition has indeed been confirmed by a quick data analysis. As a consequence, we had the idea to perform a kind of domain data adaptation, inspired by the work of (Raymond and Fayolle, 2010). In this context, the data we want to adapt is called source domain. In our case, train and dev data play the role of this source domain. The role of target domain is played by the new version of tweet data provided for the shared task, that is dev 2105 data. The approach described in (Raymond and Fayolle, 2010) mixes together data from the source domain and from the target domain in order to train a CRF model. The originality of this approach consists in using more CRF features for the part of the data constituting the target domain than features for the data constituting the source domain. </context>
<context position="8293" citStr="Raymond and Fayolle, 2010" startWordPosition="1388" endWordPosition="1392">ature scores) on features in the target domain, since they are described by more information (features). We annotated afterwards the training data, which we have already seen during the training phase, with such a model. If the model can apply stronger dependencies learned from the targetdomain part of the training data, it will apply such dependencies performing thus the desired adaptation. Otherwise it will apply the dependencies learned from the source-domain part of the training data, thus keeping the old annotation. We only applied an approximation of this domain adaptation procedure of (Raymond and Fayolle, 2010), because of a serious lack of time. In order to create our final model, we trained our first CRF model (with the templates mentioned in the previous section) with dev 2015. We then applied this first CRF model to train and dev to obtain train crf and dev crf. So, these data are labelled with our first CRF model. We got rid of the original labels for train and dev. And, in the end, we trained our final model (always with the same templates) with dev 2015, train crf and dev crf all together. We did the same procedure for the 10 types entities and for no typed data. Our results are deprecision r</context>
</contexts>
<marker>Raymond, Fayolle, 2010</marker>
<rawString>Christian Raymond and Julien Fayolle. 2010. Reconnaissance robuste d’entités nommées sur de la parole transcrite automatiquement. In Conférence Traitement automatique des langues naturelles, TALN’10, Montréal, Québec, Canada, July. ATALA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Alan Ritter</author>
<author>Sam Clark</author>
<author>Mausam</author>
<author>Oren Etzioni</author>
</authors>
<title>Named entity recognition in tweets: An experimental study.</title>
<date>2011</date>
<booktitle>In Proceedings of the Conference on Empirical Methods in Natural Language Processing, EMNLP ’11,</booktitle>
<pages>1524--1534</pages>
<publisher>Association</publisher>
<institution>for Computational Linguistics.</institution>
<location>Stroudsburg, PA, USA.</location>
<contexts>
<context position="1905" citStr="Ritter et al., 2011" startWordPosition="317" endWordPosition="320">nd section. The third section focuses on some experiments with train and dev and gives the obtained results. The fourth section is about the procedure we have used to build our final model, by applying a domain adaptation strategy. In the last section, we discuss some future work for this shared task. 2 Data Analysis Although named entity recognition is a traditional task of natural language processing (NLP) which has given rise to a large body of works for written English (Finkel et al., 2005) or news wires in French (Stern and Sagot, 2010), the same task with Twitter data remains difficult (Ritter et al., 2011). Today wasz Fun cusz anna Came juss for me &lt;3: hahaha Figure 1: An example of tweet This is not only because of the task itself, but also because of the way tweets are written. Figure 1 shows an example of tweet. The correct sentence should be: Today was fun because Anna came just for me &lt;3: hahaha. We can note the following phenomena: • spelling mistakes: wasz (was), cusz (because), juss (just) • confusion of upper/lower cases: Fun (fun), anna (Anna), Came (came) • emoticon: &lt;3 • interjection: hahaha We remark here that the only name has no upper case letters whereas other words have upper c</context>
</contexts>
<marker>Ritter, Clark, Mausam, Etzioni, 2011</marker>
<rawString>Alan Ritter, Sam Clark, Mausam, and Oren Etzioni. 2011. Named entity recognition in tweets: An experimental study. In Proceedings of the Conference on Empirical Methods in Natural Language Processing, EMNLP ’11, pages 1524–1534, Stroudsburg, PA, USA. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Rosa Stern</author>
<author>Benoit Sagot</author>
</authors>
<title>Resources for named entity recognition and resolution in news wires.</title>
<date>2010</date>
<booktitle>In Entity 2010 Workshop at LREC</booktitle>
<contexts>
<context position="1832" citStr="Stern and Sagot, 2010" startWordPosition="305" endWordPosition="308">re chosen according to the state-of-the-art, they are described in the second section. The third section focuses on some experiments with train and dev and gives the obtained results. The fourth section is about the procedure we have used to build our final model, by applying a domain adaptation strategy. In the last section, we discuss some future work for this shared task. 2 Data Analysis Although named entity recognition is a traditional task of natural language processing (NLP) which has given rise to a large body of works for written English (Finkel et al., 2005) or news wires in French (Stern and Sagot, 2010), the same task with Twitter data remains difficult (Ritter et al., 2011). Today wasz Fun cusz anna Came juss for me &lt;3: hahaha Figure 1: An example of tweet This is not only because of the task itself, but also because of the way tweets are written. Figure 1 shows an example of tweet. The correct sentence should be: Today was fun because Anna came just for me &lt;3: hahaha. We can note the following phenomena: • spelling mistakes: wasz (was), cusz (because), juss (just) • confusion of upper/lower cases: Fun (fun), anna (Anna), Came (came) • emoticon: &lt;3 • interjection: hahaha We remark here that</context>
</contexts>
<marker>Stern, Sagot, 2010</marker>
<rawString>Rosa Stern and Benoit Sagot. 2010. Resources for named entity recognition and resolution in news wires. In Entity 2010 Workshop at LREC 2010.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jun Suzuki</author>
<author>Hideki Isozaki</author>
</authors>
<title>Semi-supervised sequential labeling and segmentation using gigaword scale unlabeled data.</title>
<date>2008</date>
<booktitle>In ACL.</booktitle>
<contexts>
<context position="3104" citStr="Suzuki and Isozaki, 2008" startWordPosition="528" endWordPosition="531">s other words have upper cases (like ”Fun”, ”Came”). So, it would be difficult for a named entity extractor to correctly detect this person name. 3 CRF Implementation and Features 3.1 CRF Features We used the CRF implementation Wapiti 1.5.0 1 to create our CRF model. The optimization algorithm we chose was rprop+. The features for the tokens are all in unigrams and within a window of size 3 (previous token, current token and next token). The bigrams are only made of labels, characterizing label transitions. Table 1 shows the features we implemented. These templates have been chosen following (Suzuki and Isozaki, 2008), (Lavergne et al., 2010), (Nooralahzadeh et al., 2014) and (Constant et al., 2011) 1https://wapiti.limsi.fr 68 Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 68–71, Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics token value fstUpper shortCap longCap mixCap hasUpper allUpper capType: combination of 6 binary values allLetter singleLetter tokenType: punctuation, 9, x or X hasNumber allNumber isDecimal onePunct allPunct hasPunct longPunct hasQuotation hasAtLeast2periodes finishedByPeriode hasDash lower returnUnicodeVector isEmal isURL isR</context>
</contexts>
<marker>Suzuki, Isozaki, 2008</marker>
<rawString>Jun Suzuki and Hideki Isozaki. 2008. Semi-supervised sequential labeling and segmentation using gigaword scale unlabeled data. In In ACL.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>