@proceedings{RANLP:2017,
  editor    = {Mitkov, Ruslan and Angelova, Galia},
  title     = {Proceedings of the International Conference Recent Advances in Natural Language Processing, {RANLP} 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  url       = {https://doi.org/10.26615/978-954-452-049-6},
}

@inproceedings{abualhaija-EtAl:2017:RANLP,
  author    = {Abualhaija, Sallam and Tahmasebi, Nina and Forin, Diane and Zimmermann, Karl-Heinz},
  title     = {Parameter Transfer across Domains for Word Sense Disambiguation},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {1--8},
  abstract  = {Word sense disambiguation is defined as finding the corresponding sense
    for a target word in a given context, which comprises a major step in text
    applications. Recently, it has been addressed as an optimization problem.
    The idea behind is to find a sequence of senses that corresponds to the
    words in a given context with a maximum semantic similarity. Metaheuristics
    like simulated annealing and D-Bees provide approximate good-enough
    solutions, but are usually influenced by the starting parameters. In this
    paper, we study the parameter tuning for both algorithms within the word
    sense disambiguation problem. The experiments are conducted on different
    datasets to cover different disambiguation scenarios. We show that D-Bees
    is robust and less sensitive towards the initial parameters compared to
    simulated annealing, hence, it is sufficient to tune the parameters once
    and reuse them for different datasets, domains or languages.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_001},
}

@inproceedings{aburaed-chiruzzo-saggion:2017:RANLP,
  author    = {AbuRa'ed, Ahmed and Chiruzzo, Luis and Saggion, Horacio},
  title     = {What Sentence are you Referring to and Why? Identifying Cited Sentences in Scientific Literature},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {9--17},
  abstract  = {In the current context of scientific information overload, text mining
    tools are of paramount importance for researchers who have to read
    scientific papers and assess their value. Current citation networks, which
    link papers by citation relationships (reference and citing paper), are
    useful to quantitatively understand the value of a piece of scientific
    work, however they are limited in that they do not provide information
    about what specific part of the reference paper the citing paper is
    referring to. This qualitative information is very important, for example,
    in the context of current community-based scientific summarization
    activities. In this paper, and relying on an annotated dataset of
    co-citation sentences, we carry out a number of experiments aimed at, given
    a citation sentence, automatically identify a part of a reference paper
    being cited. Additionally our algorithm predicts the specific reason why
    such reference sentence has been cited out of five possible reasons.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_002},
}

@inproceedings{agirrezabal-alegria-hulden:2017:RANLP,
  author    = {Agirrezabal, Manex and Alegria, I{\~n}aki and Hulden, Mans},
  title     = {A Comparison of Feature-Based and Neural Scansion of Poetry},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {18--23},
  abstract  = {Automatic analysis of poetic rhythm is a challenging task that involves
    linguistics, literature, and computer science. When the language to be analyzed
    is known, rule-based systems or data-driven methods can be used. In this paper,
    we analyze poetic rhythm in English and Spanish. We show that the
    representations of data learned from character-based neural models are more
    informative than the ones from hand-crafted features, and that a
    Bi-LSTM+CRF-model produces state-of-the art accuracy on scansion of poetry in
    two languages. Results also show that the information about whole word
    structure, and not just independent syllables, is highly informative for
    performing scansion.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_003},
}

@inproceedings{ahmadnia-serrano-haffari:2017:RANLP,
  author    = {Ahmadnia, Benyamin and Serrano, Javier and Haffari, Gholamreza},
  title     = {{Persian-Spanish} Low-Resource Statistical Machine Translation Through {English} as Pivot Language},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {24--30},
  abstract  = {This paper is an attempt to exclusively focus on investigating the pivot
    language technique in which a bridging language is utilized to increase the
    quality of the Persian-Spanish low-resource Statistical Machine Translation
    (SMT). In this case, English is used as the bridging language, and the
    Persian-English SMT is combined with the English-Spanish one, where the
    relatively large corpora of each may be used in support of the Persian-Spanish
    pairing. Our results indicate that the pivot language technique outperforms the
    direct SMT processes currently in use between Persian and Spanish. Furthermore,
    we investigate the sentence translation pivot strategy and the phrase
    translation in turn, and demonstrate that, in the context of the
    Persian-Spanish SMT system, the phrase-level pivoting outperforms the
    sentence-level pivoting. Finally we suggest a method called combination model
    in which the standard direct model and the best triangulation pivoting model
    are blended in order to reach a high-quality translation.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_004},
}

@inproceedings{aker-derczynski-bontcheva:2017:RANLP,
  author    = {Aker, Ahmet and Derczynski, Leon and Bontcheva, Kalina},
  title     = {Simple Open Stance Classification for Rumour Analysis},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {31--39},
  abstract  = {Stance classification determines the attitude, or stance, in a
    (typically short) text. The task has powerful applications, such as the
    detection of fake news or the automatic extraction of attitudes toward
    entities or events in the media. This paper describes a surprisingly simple
    and efficient classification approach to open stance classification in
    Twitter, for rumour and veracity classification. The approach profits from
    a novel set of automatically identifiable problem-specific features, which
    significantly boost classifier accuracy and achieve above state-of-the-art
    results on recent benchmark datasets. This calls into question the value of
    using complex sophisticated models for stance classification without first
    doing informed feature extraction.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_005},
}

@inproceedings{aker-petrak-sabbah:2017:RANLP,
  author    = {Aker, Ahmet and Petrak, Johann and Sabbah, Firas},
  title     = {An Extensible Multilingual Open Source Lemmatizer},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {40--45},
  abstract  = {We present GATE DictLemmatizer, a multilingual open source lemmatizer for the
    GATE NLP framework that currently supports English, German, Italian, French,
    Dutch, and Spanish, and is easily extensible to other languages. The software
    is freely available under the LGPL license. The lemmatization is based on the
    Helsinki Finite-State Transducer Technology (HFST) and lemma dictionaries
    automatically created from Wiktionary. We evaluate the performance of the
    lemmatizers against TreeTagger, which is only freely available for research
    purposes. Our evaluation shows that DictLemmatizer achieves similar or even
    better results than TreeTagger for languages where there is support from HFST.
    The performance drops when there is no support from HFST and the entire
    lemmatization process is based on lemma dictionaries. However, the results are
    still satisfactory given the fact that DictLemmatizer is open-source and can be
    easily extended to other languages. The software for extending the lemmatizer
    by creating word lists from Wiktionary dictionaries is also freely available as
    open-source software.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_006},
}

@inproceedings{albogamy-ramsay:2017:RANLP,
  author    = {Albogamy, Fahad and Ramsay, Allan},
  title     = {Universal Dependencies for {Arabic} Tweets},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {46--51},
  abstract  = {To facilitate cross-lingual studies, there is an increasing interest in
    identifying linguistic universals. Recently, a new universal scheme was
    designed as a part of universal dependency project. In this paper, we map the
    Arabic tweets dependency treebank (ATDT) to the Universal Dependency
    (UD) scheme to compare it to other language resources and for the purpose of
    cross-lingual studies.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_007},
}

@inproceedings{almansor-alani:2017:RANLP,
  author    = {Almansor, Ebtesam H. and Al-Ani, Ahmed},
  title     = {Translating Dialectal {Arabic} as Low Resource Language using Word Embedding},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {52--57},
  abstract  = {A number of machine translation methods have been proposed in recent years to
    deal with the increasingly important problem of automatic translation between
    texts of different languages or languages and their dialects.
    These methods have produced promising results when applied to some of the
    widely studied languages. Existing translation methods are mainly implemented
    using rule-based and static machine translation approaches.
    Rule based approaches utilize language translation rules that can either be
    constructed by an expert, which is quite difficult when dealing with dialects,
    or rely on rule construction algorithms, which require very large parallel
    datasets.
    Statistical approaches also require large parallel datasets to build the
    translation models.
    However, large parallel datasets do not exist for languages with low
    resources, such as the Arabic language and its dialects. In this paper we
    propose an algorithm that attempts to overcome this limitation, and apply it to
    translate the Egyptian dialect (EGY) to Modern Standard Arabic (MSA).
    Monolingual corpus was collected for both MSA and EGY and a relatively small
    parallel language pair set was built to train the models. The proposed method
    utilizes Word embedding as it requires monolingual data rather than parallel
    corpus. Both Continuous Bag of Words and Skip-gram were used to build word
    vectors. The proposed method was validated on four different datasets using a
    four-fold cross validation approach.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_008},
}

@inproceedings{almiman-ramsay:2017:RANLP1,
  author    = {Almiman, Ali and Ramsay, Allan},
  title     = {Using {English} Dictionaries to generate Commonsense Knowledge in Natural Language},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {58--63},
  abstract  = {This paper presents an approach to generating common sense knowledge written in
    raw English sentences. Instead of using public contributors to feed this
    source, this system chose to employ expert linguistics decisions by using
    definitions from English dictionaries. Because the definitions in English
    dictionaries are not prepared to be transformed into inference rules, some
    preprocessing steps were taken to turn each relation of word:definition in
    dictionaries into an inference rule in the form left-hand side $\Rightarrow$
    right-hand side. In this paper, we applied this mechanism using two
    dictionaries: The MacMillan Dictionary and WordNet definitions. A random set of
    200 inference rules were extracted equally from the two dictionaries, and then
    we used human judgment as to whether these rules are `True' or not. For the
    MacMillan Dictionary the precision reaches 0.74 with 0.508 recall, and the
    WordNet definitions resulted in 0.73 precision with 0.09 recall.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_009},
}

@inproceedings{almiman-ramsay:2017:RANLP2,
  author    = {Almiman, Ali and Ramsay, Allan},
  title     = {A Hybrid System to apply Natural Language Inference over Dependency Trees},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {64--70},
  abstract  = {This paper presents the development of a natural language inference engine that
    benefits from two current standard approaches; i.e., shallow and deep
    approaches. This system combines two non-deterministic algorithms: the
    approximate matching from the shallow approach and a theorem prover from the
    deep approach for handling multi-step inference tasks. The theorem prover is
    customized to accept dependency trees and apply inference rules to these trees.
    The inference rules are automatically generated as syllogistic rules from our
    test data (FraCaS test suite). The theorem prover exploits a non-deterministic
    matching algorithm within a standard backward chaining inference engine. We
    employ continuation programming as a way of seamlessly handling the combination
    of these two non-deterministic algorithms. Testing the matching algorithm on
    ``Generalized quantifiers'' and ``adjectives'' topics in FraCaS (MacCartney
    and Manning 2007), we achieved an accuracy of 92.8\% of the single-premise
    cases. For the multi-steps of inference, we checked the validity of our
    syllogistic rules and then extracted four generic instances that can be applied
    to more than one problem.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_010},
}

@inproceedings{barbu:2017:RANLP,
  author    = {Barbu, Eduard},
  title     = {Ensembles of Classifiers for Cleaning Web Parallel Corpora and Translation Memories},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {71--77},
  abstract  = {The last years witnessed an increasing interest in the automatic methods
    for spotting false translation units in translation memories. This problem
    presents a great interest to industry as there are many translation
    memories that contain errors. A closely related line of research deals with
    identifying sentences that do not align in the parallel corpora mined from
    the web. The task of spotting false translations is modeled as a binary
    classification problem. It is known that in certain conditions the
    ensembles of classifiers improve over the performance of the individual
    members. In this paper we benchmark the most popular ensemble of
    classifiers: Majority Voting, Bagging, Stacking and Ada Boost at the task
    of spotting false translation units for translation memories and parallel
    web corpora. We want to know if for this specific problem any ensemble
    technique improves the performance of the individual classifiers and if
    there is a difference between the data in translation memories and parallel
    web corpora with respect to this task.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_011},
}

@inproceedings{basaldella-EtAl:2017:RANLP,
  author    = {Basaldella, Marco and Helmy, Muhammad and Antolli, Elisa and Popescu, Mihai Horia and Serra, Giuseppe and Tasso, Carlo},
  title     = {Exploiting and Evaluating a Supervised, Multilanguage Keyphrase Extraction pipeline for under-resourced languages},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {78--85},
  abstract  = {This paper evaluates different techniques for building a supervised,
    multilanguage keyphrase extraction pipeline for languages which lack a gold
    standard. Starting from an unsupervised English keyphrase extraction
    pipeline, we implement pipelines for Arabic, Italian, Portuguese, and
    Romanian, and we build test collections for languages which lack one. Then,
    we add a Machine Learning module trained on a well-known English language
    corpus and we evaluate the performance not only over English but on the
    other languages as well. Finally, we repeat the same evaluation after
    training the pipeline over an Arabic language corpus to check whether using
    a language-specific corpus brings a further improvement in performance. On
    the five languages we analyzed, results show an improvement in performance
    when using a machine learning algorithm, even if such algorithm is not
    trained and tested on the same language.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_012},
}

@inproceedings{bastawisy-elmahdy:2017:RANLP,
  author    = {Bastawisy, Ahmed and Elmahdy, Mohamed},
  title     = {Multi-Lingual Phrase-Based Statistical Machine Translation for {Arabic-English}},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {86--89},
  abstract  = {In this paper, we implement a multilingual Statistical Machine Translation
    (SMT) system for Arabic-English Translation. Arabic Text can be categorized
    into standard and dialectal Arabic. These two forms of Arabic differ
    significantly. Different mono-lingual and multi-lingual hybrid SMT approaches
    are compared. Mono-lingual systems do always results in better translation
    accuracy in one Arabic form and poor accuracy in the other. Multi-lingual SMT
    models that are trained with pooled parallel MSA/dialectal data result in
    better accuracy. However, since the available parallel MSA data are much larger
    compared to dialectal data, multilingual models are biased to MSA. We propose
    in the work, a multi-lingual combination of different mono-lingual systems
    using an Arabic form classifier. The outcome of the classifier directs the system
    to use the appropriate mono-lingual models (standard, dialectal, or mixture).
    Testing the different SMT systems shows that the proposed classifier-based SMT
    system outperforms mono-lingual and data pooled multi-lingual systems.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_013},
}

@inproceedings{benikova-zesch:2017:RANLP,
  author    = {Benikova, Darina and Zesch, Torsten},
  title     = {Same same, but different: Compositionality of paraphrase granularity levels},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {90--96},
  abstract  = {Paraphrases exist on different granularity levels, the most frequently
    used one being the sentential level. However, we argue that working on the
    sentential level is not optimal for both machines and humans, and that it
    would be easier and more efficient to work on sub-sentential levels. To
    prove this, we quantify and analyze the difference between paraphrases on
    both sentence and sub-sentence level in order to show the significance of
    the problem. First results on a preliminary dataset seem to confirm our
    hypotheses.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_014},
}

@inproceedings{bobicev-sokolova:2017:RANLP,
  author    = {Bobicev, Victoria and Sokolova, Marina},
  title     = {Inter-Annotator Agreement in Sentiment Analysis: Machine Learning Perspective},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {97--102},
  abstract  = {Manual text annotation is an essential part of Big Text analytics.
    Although annotators work with limited parts of data sets, their results are
    extrapolated by automated text classification and affect the final
    classification results. Reliability of annotations and adequacy of assigned
    labels are especially important in the case of sentiment annotations. In
    the current study we examine inter-annotator agreement in multi-class,
    multi-label sentiment annotation of messages. We used several annotation
    agreement measures, as well as statistical analysis and Machine Learning to
    assess the resulting annotations.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_015},
}

@inproceedings{borocs-dumitrescu-pipa:2017:RANLP,
  author    = {Boro{\c{s}}, Tiberiu and Dumitrescu, Stefan Daniel and Pipa, Sonia},
  title     = {Fast and Accurate Decision Trees for Natural Language Processing Tasks},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {103--110},
  abstract  = {Decision trees have been previously employed in many machine-learning tasks
    such as part-of-speech tagging, lemmatization, morphological-attribute
    resolution, letter-to-sound conversion and statistical-parametric speech
    synthesis. In this paper we introduce an optimized tree-computation algorithm,
    which is based on the original ID3 algorithm. We also introduce a tree-pruning
    method that uses a development set to delete nodes from over-fitted models. The
    later mentioned algorithm also uses a results caching method for speed-up. Our
    algorithm is almost 200 times faster than a naive implementation and yields
    accurate results on our test datasets.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_016},
}

@inproceedings{bossard-rodrigues:2017:RANLP,
  author    = {Bossard, Aur{\'e}lien and Rodrigues, Christophe},
  title     = {An Evolutionary Algorithm for Automatic Summarization},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {111--120},
  abstract  = {This paper proposes a novel method to select sentences for automatic
    summarization based on an evolutionary algorithm. The algorithm explores
    candidate summaries space following an objective function computed over ngrams
    probability distributions of the candidate summary and the source documents.
    This method does not consider a summary as a stack of independent sentences but
    as a whole text, and makes use of advances in unsupervised summarization
    evaluation. We compare this sentence extraction method to one of the best
    existing methods which is based on integer linear programming, and show its
    efficiency on three different acknowledged corpora.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_017},
}

@inproceedings{boyanov-EtAl:2017:RANLP,
  author    = {Boyanov, Martin and Nakov, Preslav and Moschitti, Alessandro and Da San Martino, Giovanni and Koychev, Ivan},
  title     = {Building Chatbots from Forum Data: Model Selection Using Question Answering Metrics},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {121--129},
  abstract  = {We propose to use question answering (QA) data from Web forums to train
    chat-bots from scratch, i.e., without dialog data.
    First, we extract pairs of question and answer sentences from the typically
    much longer texts of questions and answers in a forum. We then use these
    shorter texts to train seq2seq models in a more efficient way. We further
    improve the parameter optimization using a new model selection strategy based
    on QA measures.
    Finally, we propose to use extrinsic evaluation with respect to a QA task as an
    automatic evaluation method for chatbot systems. The evaluation shows that the
    model achieves a MAP of 63.5\% on the extrinsic task. Moreover, our manual
    evaluation demonstrates that the model can answer correctly 49.5\% of the
    questions when they are similar in style to how questions are asked in the
    forum, and 47.3\% of the questions, when they are more conversational in style.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_018},
}

@inproceedings{boytcheva-nikolova-angelova:2017:RANLP,
  author    = {Boytcheva, Svetla and Nikolova, Ivelina and Angelova, Galia},
  title     = {Mining Association Rules from Clinical Narratives},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {130--138},
  abstract  = {Shallow text analysis (Text Mining) uses mainly Information Extraction
    techniques. The low resource languages do not allow application of such
    traditional techniques with sufficient accuracy and recall on big data. In
    contrast, Data Mining approaches provide an opportunity to make deep
    analysis and to discover new knowledge. Frequent pattern mining approaches
    are used mainly for structured information in databases and are a quite
    challenging task in text mining. Unfortunately, most frequent pattern
    mining approaches do not use contextual information for extracted patterns:
    general patterns are extracted regardless of the context. We propose a
    method that processes raw informal texts (from health discussion forums)
    and formal texts (outpatient records) in Bulgarian language. In addition we
    use some context information and small terminological lexicons to
    generalize extracted frequent patterns. This allows to map informal
    expression of medical terminology to the formal one and to generate
    automatically resources.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_019},
}

@inproceedings{calixto-liu:2017:RANLP,
  author    = {Calixto, Iacer and Liu, Qun},
  title     = {Sentence-Level Multilingual Multi-modal Embedding for Natural Language Processing},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {139--148},
  abstract  = {We propose a novel discriminative ranking model that learns embeddings
    from multilingual and multi-modal data, meaning that our model can take
    advantage of images and descriptions in multiple languages to improve
    embedding quality. To that end, we introduce an objective function that
    uses pairwise ranking adapted to the case of three or more input sources.
    We compare our model against different baselines, and evaluate the
    robustness of our embeddings on image--sentence ranking (ISR), semantic
    textual similarity (STS), and neural machine translation (NMT). We find
    that the additional multilingual signals lead to improvements on all three
    tasks, and we highlight that our model can be used to consistently improve
    the adequacy of translations generated with NMT models when re-ranking
    n-best lists.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_020},
}

@inproceedings{calleja-EtAl:2017:RANLP,
  author    = {Calleja, Pablo and Garc{\'i}a Castro, Ra{\'u}l and Aguado-de-Cea, Guadalupe and G{\'o}mez-P{\'e}rez, Asunci{\'o}n},
  title     = {Role-based model for Named Entity Recognition},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {149--156},
  abstract  = {Named Entity Recognition (NER) poses new challenges in real-world documents in
    which there are entities with different roles according to their purpose or
    meaning. Retrieving all the possible entities in scenarios in which only a
    subset of them based on their role is needed, produces noise on the overall
    precision. This work proposes a NER model that relies on role classification
    models that support recognizing entities with a specific role. The proposed
    model has been implemented in two use cases using Spanish drug Summary of
    Product Characteristics: identification of therapeutic indications and
    identification of adverse reactions. The results show how precision is
    increased using a NER model that is oriented towards a specific role and
    discards entities out of scope.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_021},
}

@inproceedings{canales-EtAl:2017:RANLP,
  author    = {Canales, Lea and Daelemans, Walter and Boldrini, Ester and Martinez-Barco, Patricio},
  title     = {Towards the Improvement of Automatic Emotion Pre-annotation with Polarity and Subjective Information},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {157--163},
  abstract  = {Emotion detection has a high potential positive impact on the benefit of
    business, society, politics or education. Given this, the main objective of
    our research is to contribute to the resolution of one of the most
    important challenges in textual emotion detection: emotional corpora
    annotation. This will be tackled by proposing a semi-automatic methodology.
    It consists in two main phases: (1) an automatic process to pre-annotate
    the unlabelled sentences with a reduced number of emotional categories; and
    (2) a manual process of refinement where human annotators will determine
    which is the dominant emotion between the pre-defined set. Our objective in
    this paper is to show the pre-annotation process, as well as to evaluate
    the usability of subjective and polarity information in this process. The
    evaluation performed confirms clearly the benefits of employing the
    polarity and subjective information on emotion detection and thus endorses
    the relevance of our approach.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_022},
}

@inproceedings{chen-bangalore:2017:RANLP,
    author = {Chen, John and Bangalore, Srinivas},
    title = {Underspecification in Natural Language Understanding for Dialog Automation},
    booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
    year = {2017},
    month = {September},
    address = {Varna, Bulgaria},
    publisher = {INCOMA Ltd.},
    pages = {164--170},
    url = {https://doi.org/10.26615/978-954-452-049-6_023},
    abstract = {With the increasing number of communication platforms that offer variety of
        ways of connecting two interlocutors, there is a resurgence of chat-based
        dialog systems. These systems, typically known as {\em chatbots} have been
        successfully applied in a range of consumer and enterprise applications. A key
        technology in such chat-bots is robust natural language understanding (NLU)
        which can significantly influence and impact the efficacy of the conversation
        and ultimately the user-experience. While NLU is far from perfect, this paper
        illustrates the role of {\em underspecification} and its impact on successful
        dialog completion.},
}

@InProceedings{chiru-decea:2017:RANLP,
  author    = {Chiru, Costin  and  Decea, Remus},
  title     = {Identification and Classification of the Most Important Moments in Students' Collaborative Chats},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {171--176},
  abstract  = {In this paper, we present an application for the automatic identification of
	the important moments that might occur during students' collaborative chats.
	The moments are detected based on the input received from the user, who may
	choose to perform an analysis on the topics that interest him/her. Moreover,
	the application offers various types of suggestive and intuitive graphics that
	aid the user in identification of such moments. There are two main aspects that
	are considered when identifying important moments: the concepts' frequency and
	distribution throughout the conversation and the chat tempo, which is analyzed
	for identifying intensively debated concepts. By the tempo of the chat we
	understand the rate at which the ideas are input by the chat participants,
	expressed by the utterances' timestamps.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_024}
}

@InProceedings{cotik-EtAl:2017:RANLP,
  author    = {Cotik, Viviana  and  Filippo, Dar\'{i}o  and  Roller, Roland  and  Uszkoreit, Hans  and  Xu, Feiyu},
  title     = {Annotation of Entities and Relations in {Spanish} Radiology Reports},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {177--184},
  abstract  = {Radiology reports express the results of a radiology study and contain
	information about anatomical entities, findings, measures and impressions of
	the medical doctor. The use of information extraction techniques can help
	physicians to access this information in order to understand data and to infer
	further knowledge.
	Supervised machine learning methods are very popular to address information
	extraction, but are usually domain and language dependent. To train new
	classification models, annotated data is required. Moreover, annotated data is
	also required as an evaluation resource of information extraction algorithms.
	However, one major drawback of processing clinical data is the low availability
	of annotated datasets. For this reason we performed a manual annotation of
	radiology reports written in Spanish. This paper presents the corpus, the
	annotation schema, the annotation guidelines and further insight of the data.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_025}
}

@inproceedings{dakota-kubler:2017:RANLP,
    author = {Dakota, Daniel and K\"{u}bler, Sandra},
    title = {Towards Replicability in Parsing},
    booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
    year = {2017},
    month = {September},
    address = {Varna, Bulgaria},
    publisher = {INCOMA Ltd.},
    pages = {185--194},
    url = {https://doi.org/10.26615/978-954-452-049-6_026},
    abstract = {We investigate parsing replicability across 7 languages (and 8 treebanks),
        showing that choices concerning the use of grammatical functions in parsing or
        evaluation, the influence of the rare word threshold, as well as choices in
        test sentences and evaluation script options have considerable and often
        unexpected effects on parsing accuracies. All of those choices need to be
        carefully documented if we want to ensure replicability.},
}

@InProceedings{davoodi-kosseim:2017:RANLP,
  author    = {Davoodi, Elnaz  and  Kosseim, Leila},
  title     = {Automatic Identification of {AltLexes} using Monolingual Parallel Corpora},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {195--200},
  abstract  = {The automatic identification of discourse relations is still a challenging task
	in natural language processing. Discourse connectives, such as since or but,
	are the most informative cues to identify explicit relations; however discourse
	parsers typically use a closed inventory of such connectives. As a result,
	discourse relations signalled by markers outside these inventories (i.e.
	AltLexes) are not detected as effectively. In this paper, we propose a novel
	method to leverage parallel corpora in text simplification and lexical
	resources to automatically identify alternative lexicalizations that signal
	discourse relation. When applied to the Simple Wikipedia and Newsela corpora
	along with WordNet and the PPDB, the method allowed the automatic discovery of
	91 AltLexes.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_027}
}

@InProceedings{dinu-dinu-dumitru:2017:RANLP,
  author    = {Dinu, Anca  and  Dinu, Liviu P.  and  Dumitru, Bogdan},
  title     = {On the stylistic evolution from communism to democracy: {Solomon Marcus} study case},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {201--207},
  abstract  = {In this article we propose a stylistic analysis of Solomon Marcus'
	non-scientific published texts, gathered in six volumes, aiming to uncover
	some of his quantitative and qualitative fingerprints. Moreover, we compare
	and cluster two distinct periods of time in his writing style: 22
	years of communist regime (1967-1989) and 27 years of democracy (1990-2016).
	The distributional analysis of Marcus' text reveals that the passing from the
	communist regime period to democracy is sharply marked by two complementary
	changes in Marcus' writing: in the pre-democracy period, the communist norms
	of writing style demanded on the one hand long phrases, long words and
	clich\'{e}s, and on the other hand, a short list of preferred ``official'' topics;
	in democracy tendency was towards shorten phrases and words while approaching
	a broader area of topics.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_028}
}

@InProceedings{edouard-EtAl:2017:RANLP1,
  author    = {Edouard, Amosse  and  Cabrio, Elena  and  Tonelli, Sara  and  Le-Thanh, Nhan},
  title     = {Building timelines of soccer matches from {Twitter}},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {208--213},
  abstract  = {This demo paper presents a system that builds a timeline with salient actions
	of a soccer game, based on the tweets posted by users. It combines information
	provided by external knowledge bases to enrich the content of tweets and
	applies graph theory to model relations between actions (e.g. goals, penalties)
	and participants of a game (e.g. players, teams). In the demo, a web
	application displays in nearly real-time the actions detected from
	tweets posted by users for a given match of Euro 2016. Our tools are freely
	available at https://bitbucket.org/eamosse/event\_tracking.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_029}
}

@inproceedings{edouard-EtAl:2017:RANLP2,
    author = {Edouard, Amosse and Cabrio, Elena and Tonelli, Sara and Le-Thanh, Nhan},
    title = {You'll Never Tweet Alone: Building Sports Match Timelines from Microblog Posts},
    booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
    year = {2017},
    month = {September},
    address = {Varna, Bulgaria},
    publisher = {INCOMA Ltd.},
    pages = {214--221},
    url = {https://doi.org/10.26615/978-954-452-049-6_030},
    abstract = {In this paper, we propose an approach to build a timeline with actions in a
        sports game based on tweets. We combine information provided by external
        knowledge bases to enrich the content of the tweets, and apply graph theory to
        model relations between actions and participants in a game. We demonstrate the
        validity of our approach using tweets collected during the EURO 2016
        Championship and evaluate the output against live summaries produced by sports
        channels.},
}

@InProceedings{edouard-EtAl:2017:RANLP3,
  author    = {Edouard, Amosse  and  Cabrio, Elena  and  Tonelli, Sara  and  Le-Thanh, Nhan},
  title     = {Graph-based Event Extraction from {Twitter}},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {222--230},
  abstract  = {Detecting which tweets describe a specific event and clustering them is one
	of the main challenging tasks related to Social Media currently addressed in
	the NLP community. Existing approaches have mainly focused on detecting spikes
	in clusters around specific keywords or Named Entities (NE). However, one of
	the main drawbacks of such approaches is the difficulty in understanding when
	the same keywords describe different events. In this paper, we propose a novel
	approach that exploits NE mentions in tweets and their entity context to create
	a temporal event graph. Then, using simple graph theory techniques and a
	PageRank-like algorithm, we process the event graphs to detect clusters of
	tweets describing the same events. Experiments on two gold standard datasets
	show that our approach achieves state-of-the-art results both in terms of
	evaluation performances and the quality of the detected events.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_031}
}

@inproceedings{fernandez-EtAl:2017:RANLP,
    author = {Fern\'{a}ndez, Javi and Llopis, Fernando and Guti\'{e}rrez, Yoan and Mart\'{i}nez-Barco, Patricio and D\'{i}ez, \'{A}lvaro},
    title = {Opinion Mining in Social Networks versus Electoral Polls},
    booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
    year = {2017},
    month = {September},
    address = {Varna, Bulgaria},
    publisher = {INCOMA Ltd.},
    pages = {231--237},
    url = {https://doi.org/10.26615/978-954-452-049-6_032},
    abstract = {The recent failures of traditional poll models, like the predictions in United
        Kingdom with the Brexit, or in United States presidential elections with the
        victory of Donald Trump, have been noteworthy. With the decline of traditional
        poll models and the growth of the social networks, automatic tools are gaining
        popularity to make predictions in this context. In this paper we present our
        approximation and compare it with a real case: the 2017 French presidential
        election.},
}

@InProceedings{galarreta-melgar-oncevay:2017:RANLP,
  author    = {Galarreta, Ana Paula  and  Melgar, Andr\'{e}s  and  Oncevay, Arturo},
  title     = {Corpus Creation and Initial {SMT} Experiments between {Spanish} and {Shipibo-konibo}},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {238--244},
  abstract  = {In this paper, we present the first attempts to develop a machine translation
	(MT) system between Spanish and Shipibo-konibo (es-shp). There are very few
	digital texts written in Shipibo-konibo and even less bilingual texts that can
	be aligned, hence we had to create a parallel corpus using both bilingual and
	monolingual texts. We will describe how this corpus was made, as well as the
	process we followed to improve the quality of the sentences used to build a
	statistical MT model or SMT. The results obtained surpassed the baseline
	proposed (dictionary based) and made a promising result for further development
	considering the size of corpus used. Finally, it is expected that this MT
	system can be reinforced with the use of additional linguistic rules and
	automatic language processing functions that are being implemented.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_033}
}

@InProceedings{galieva-nevzorova-yakubova:2017:RANLP,
  author    = {Galieva, Alfiya  and  Nevzorova, Olga  and  Yakubova, Dilyara},
  title     = {{Russian-Tatar} Socio-Political Thesaurus: Methodology, Challenges, the Status of the Project},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {245--252},
  abstract  = {This paper discusses the general methodology and important practical aspects of
	implementing a new bilingual lexical resource -- the Russian-Tatar
	Socio-Political Thesaurus that is being developed on the basis of the Russian
	RuThes thesaurus format as a hierarchy of concepts viewed as units of thought.
	Each concept is linked with a set of language expressions (words and
	collocations) referring to it in texts (text entries). Currently the
	Russian-Tatar Socio-Political Thesaurus includes 6,000 concepts, while new
	concepts and text entries are being constantly added to it.
	The paper outlines main challenges of translating concept names and their text
	entries into Tatar, and describes ways of reflecting the specificity of the
	Tatar lexical-semantic system.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_034}
}

@inproceedings{galitsky-ilvovsky:2017:RANLP,
    author = {Galitsky, Boris and Ilvovsky, Dmitry},
    title = {On a Chat Bot Finding Answers with Optimal Rhetoric Representation},
    booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
    year = {2017},
    month = {September},
    address = {Varna, Bulgaria},
    publisher = {INCOMA Ltd.},
    pages = {253--259},
    url = {https://doi.org/10.26615/978-954-452-049-6_035},
    abstract = {We demo a chat bot with the focus on complex, multi-sentence questions that
        enforce what we call rhetoric agreement of answers with questions. Chat bot
        finds answers which are not only relevant by topic but also match the question
        by style, argumentation patterns, communication means, experience level and
        other attributes. The system achieves rhetoric agreement by learning pairs of
        discourse trees (DTs) for question (Q) and answer (A). We build a library of
        best answer DTs for most types of complex questions. To better recognize a
        valid rhetoric agreement between Q and A, DTs are extended with the labels for
        communicative actions. An algorithm for finding the best DT for an A, given a
        Q, is evaluated.},
}

@InProceedings{gao-huang:2017:RANLP,
  author    = {Gao, Lei  and  Huang, Ruihong},
  title     = {Detecting Online Hate Speech Using Context Aware Models},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {260--266},
  abstract  = {In the wake of a polarizing election, the cyber world is laden with hate
	speech. Context accompanying a hate speech text is useful for identifying hate
	speech, which however has been largely overlooked in existing datasets and hate
	speech detection models. In this paper, we provide an annotated corpus of hate
	speech with context information well kept. Then we propose two types of hate
	speech detection models that incorporate context information, a logistic
	regression model with context features and a neural network model with learning
	components for context. Our evaluation shows that both models outperform a
	strong baseline by around 3\% to 4\% in F1 score and combining these two models
	further improve the performance by another 7\% in F1 score.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_036}
}

@inproceedings{gencheva-EtAl:2017:RANLP,
    author = {Gencheva, Pepa and Nakov, Preslav and M\`{a}rquez, Llu\'{i}s and Barr\'{o}n-Cede\~{n}o, Alberto and Koychev, Ivan},
    title = {A Context-Aware Approach for Detecting Worth-Checking Claims in Political Debates},
    booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
    year = {2017},
    month = {September},
    address = {Varna, Bulgaria},
    publisher = {INCOMA Ltd.},
    pages = {267--276},
    url = {https://doi.org/10.26615/978-954-452-049-6_037},
    abstract = {In the context of investigative journalism, we address the problem of
        automatically identifying which claims in a given document are most worthy and
        should be prioritized for fact-checking. Despite its importance, this is a
        relatively understudied problem. Thus, we create a new corpus of political
        debates, containing statements that have been fact-checked by nine reputable
        sources, and we train machine learning models to predict which claims should be
        prioritized for fact-checking, i.e., we model the problem as a ranking task.
        Unlike previous work, which has looked primarily at sentences in isolation, in
        this paper we focus on a rich input representation modeling the context:
        relationship between the target statement and the larger context of the debate,
        interaction between the opponents, and reaction by the moderator and by the
        public. Our experiments show state-of-the-art results, outperforming a strong
        rivaling system by a margin, while also confirming the importance of the
        contextual information.},
}

@InProceedings{gromann-declerck:2017:RANLP,
  author    = {Gromann, Dagmar  and  Declerck, Thierry},
  title     = {Hashtag Processing for Enhanced Clustering of Tweets},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {277--283},
  abstract  = {Rich data provided by tweets have been analyzed, clustered, and explored in a
	variety of studies. Typically those studies focus on named entity recognition,
	entity linking, and entity disambiguation or clustering. Tweets and hashtags
	are generally analyzed on sentential or word level but not on a compositional
	level of concatenated words. We propose an approach for a closer analysis of
	compounds in hashtags, and in the long run also of other types of text
	sequences in tweets, in order to enhance the clustering of such text documents.
	Hashtags have been used before as primary topic indicators to cluster tweets,
	however, their segmentation and its effect on clustering results have not been
	investigated to the best of our knowledge. Our results with a standard dataset
	from the Text REtrieval Conference (TREC) show that segmented and harmonized
	hashtags positively impact effective clustering.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_038}
}

@inproceedings{guillen-gutierrez-munoz:2017:RANLP,
    author = {Guill\'{e}n, Antonio and Guti\'{e}rrez, Yoan and Mu\~{n}oz, Rafael},
    title = {Natural Language Processing Technologies for Document Profiling},
    booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
    year = {2017},
    month = {September},
    address = {Varna, Bulgaria},
    publisher = {INCOMA Ltd.},
    pages = {284--290},
    url = {https://doi.org/10.26615/978-954-452-049-6_039},
    abstract = {Nowadays, search for documents on the Internet is becoming increasingly
        difficult. The reason is the amount of content published by users (articles,
        comments, blogs, reviews). How to facilitate that the users can find their
        required documents? What would be necessary to provide useful document
        meta-data for supporting search engines? In this article, we present a study of
        some Natural Language Processing (NLP) technologies that can be useful for
        facilitating the proper identification of documents according to the user
        needs. For this purpose, it is designed a document profile that will be able to
        represent semantic meta-data extracted from documents by using NLP
        technologies. The research is basically focused on the study of different NLP
        technologies in order to support the creation our novel document profile
        proposal from semantic perspectives.},
}

@InProceedings{hazem-boussaha-hernandez:2017:RANLP,
  author    = {Hazem, Amir  and  Boussaha, Basma El Amel  and  Hernandez, Nicolas},
  title     = {{MappSent}: a Textual Mapping Approach for Question-to-Question Similarity},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {291--300},
  abstract  = {Since the advent of word embedding methods, the representation of longer pieces
	of texts such as sentences and paragraphs is gaining more and more interest,
	especially for textual similarity tasks. Mikolov et al. (2013) have demonstrated
	that words and phrases exhibit linear structures that allow to meaningfully
	combine words by an element-wise addition of their vector representations.
	Recently, Arora et al. (2017) have shown that removing the projections of the
	weighted average sum of word embedding vectors on their first principal
	components, outperforms sophisticated supervised methods including RNN's and
	LSTM's. Inspired by Mikolov et al. (2013) and Arora et al. (2017) findings and
	by a bilingual word mapping technique presented in Artetxe et al. (2016), we
	introduce MappSent, a novel approach for textual similarity. Based on a linear
	sentence embedding representation, its principle is to build a matrix that maps
	sentences in a joint-subspace where similar sets of sentences are pushed
	closer. We evaluate our approach on the SemEval 2016/2017 question-to-question
	similarity task and show that overall MappSent achieves competitive results
	and outperforms in most cases state-of-art methods.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_040}
}

@inproceedings{hercig-lenc:2017:RANLP,
    author = {Hercig, Tom\'{a}\v{s} and Lenc, Ladislav},
    title = {The Impact of Figurative Language on Sentiment Analysis},
    booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
    year = {2017},
    month = {September},
    address = {Varna, Bulgaria},
    publisher = {INCOMA Ltd.},
    pages = {301--308},
    url = {https://doi.org/10.26615/978-954-452-049-6_041},
    abstract = {Figurative language such as irony, sarcasm, and metaphor is considered a
        significant challenge in sentiment analysis. These figurative devices can
        sculpt the affect of an utterance and test the limits of sentiment analysis of
        supposedly literal texts.
        We explore the effect of figurative language on sentiment analysis. We
        incorporate the figurative language indicators into the sentiment analysis
        process and compare the results with and without the additional information
        about them. We evaluate on the SemEval-2015 Task 11 data and outperform the
        first team with our convolutional neural network model and additional training
        data in terms of mean squared error and we follow closely behind the first
        place in terms of cosine similarity.},
}

@InProceedings{hooda-kosseim:2017:RANLP,
  author    = {Hooda, Sohail  and  Kosseim, Leila},
  title     = {Argument Labeling of Explicit Discourse Relations using {LSTM} Neural Networks},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {309--315},
  abstract  = {Argument labeling of explicit discourse relations is a challenging task. The
	state of the art systems achieve slightly above 55\% F-measure but require
	hand-crafted features. In this paper, we propose a Long Short Term Memory
	(LSTM) based model for argument labeling. We experimented with multiple
	configurations of our model. Using the PDTB dataset, our best model achieved an
	F1 measure of 23.05\% without any feature engineering. This is significantly
	higher than the 20.52\% achieved by the state of the art RNN approach, but
	significantly lower than the feature based state of the art systems. On the
	other hand, because our approach learns only from the raw dataset, it is more
	widely applicable to multiple textual genres and languages.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_042}
}

@InProceedings{hu-dakota-kubler:2017:RANLP,
  author    = {Hu, Hai  and  Dakota, Daniel  and  K\"{u}bler, Sandra},
  title     = {Non-Deterministic Segmentation for {Chinese} Lattice Parsing},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {316--324},
  abstract  = {Parsing Chinese critically depends on correct word segmentation for the parser
	since incorrect segmentation inevitably causes incorrect parses. We
	investigate a pipeline approach to segmentation and parsing using word lattices
	as parser input. We compare CRF-based and lexicon-based approaches to word
	segmentation. Our results show that the lattice parser is capable of selecting
	the correction segmentation from thousands of options, thus drastically
	reducing the number of unparsed sentence. Lexicon-based parsing models have a
	better coverage than the CRF-based approach, but the many options are more
	difficult to handle. We reach our best result by using a lexicon from the
	n-best CRF analyses, combined with highly probable words.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_043}
}

@InProceedings{kanishcheva-bobicev:2017:RANLP,
  author    = {Kanishcheva, Olga  and  Bobicev, Victoria},
  title     = {Good News vs. Bad News: What are they talking about?},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {325--333},
  abstract  = {Today's massive news streams demand the automate analysis which is provided
	by various online news explorers. However, most of them do not provide
	sentiment analysis. The main problem of sentiment analysis of news is the
	differences between the writers and readers attitudes to the news text. News
	can be good or bad but have to be delivered in neutral words as pure facts.
	Although there are applications for sentiment analysis of news, the task of
	news analysis is still a very actual problem because the latest news impacts
	people's lives daily.
	In this paper, we explored the problem of sentiment analysis for Ukrainian and
	Russian news, developed a corpus of Ukrainian and Russian news and annotated
	each text using one of three categories: positive, negative and neutral. Each
	text was marked by at least three independent annotators via the web interface,
	the inter-annotator agreement was analyzed and the final label for each text
	was computed. These texts were used in the machine learning experiments.
	Further, we investigated what kinds of named entities such as Locations,
	Organizations, Persons are perceived as good or bad by the readers and which of
	them were the cause for text annotation ambiguity.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_044}
}

@InProceedings{karadzhov-EtAl:2017:RANLP1,
  author    = {Karadzhov, Georgi  and  Gencheva, Pepa  and  Nakov, Preslav  and  Koychev, Ivan},
  title     = {We Built a Fake News / Click Bait Filter: What Happened Next Will Blow Your Mind!},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {334--343},
  abstract  = {It is completely amazing! Fake news and ``click baits'' have totally invaded
	the cyberspace. Let us face it: everybody hates them for three simple reasons.
	Reason \#2 will absolutely amaze you. What these can achieve at the time of
	election will completely blow your mind! Now, we all agree, this cannot go on,
	you know, somebody has to stop it. So, we did this research, and trust us, it
	is totally great research, it really is! Make no mistake. This is the best
	research ever!
	Seriously, come have a look, we have it all: neural networks, attention
	mechanism, sentiment lexicons, author profiling, you name it. Lexical features,
	semantic features, we absolutely have it all. And we have totally tested it,
	trust us! We have results, and numbers, really big numbers. The best numbers
	ever! Oh, and analysis, absolutely top notch analysis. Interested? Come read
	the shocking truth about fake news and clickbait in the Bulgarian cyberspace.
	You won't believe what we have found!},
  url       = {https://doi.org/10.26615/978-954-452-049-6_045}
}

@InProceedings{karadzhov-EtAl:2017:RANLP2,
  author    = {Karadzhov, Georgi  and  Nakov, Preslav  and  M\`{a}rquez, Llu\'{i}s  and  Barr\'{o}n-Cede\~{n}o, Alberto  and  Koychev, Ivan},
  title     = {Fully Automated Fact Checking Using External Sources},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {344--353},
  abstract  = {Given the constantly growing proliferation of false claims online in recent
	years, there has been also a growing research interest in automatically
	distinguishing false rumors from factually true claims. Here, we propose a
	general-purpose framework for fully-automatic fact checking using external
	sources, tapping the potential of the entire Web as a knowledge source to
	confirm or reject a claim. Our framework uses a deep neural network with LSTM
	text encoding to combine semantic kernels with task-specific embeddings that
	encode a claim together with pieces of potentially relevant text fragments from
	the Web, taking the source reliability into account. The evaluation results
	show good performance on two different tasks and datasets: (i) rumor detection
	and (ii) fact checking of the answers to a question in community question
	answering forums.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_046}
}

@InProceedings{kaushik-EtAl:2017:RANLP,
  author    = {Kaushik, Divyansh  and  Gupta, Shashank  and  Raju, Chakradhar  and  Dias, Reuben  and  Ghosh, Sanjib},
  title     = {Making Travel Smarter: Extracting Travel Information From Email Itineraries Using Named Entity Recognition},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {354--362},
  abstract  = {The purpose of this research is to address the problem of extracting
	information from travel itineraries and discuss the challenges faced in the
	process. Business-to-customer emails like booking confirmations and e-tickets
	are usually machine generated by filling slots in pre-defined
	templates which improve the presentation of such emails but also make the
	emails more complex in structure. Extracting the relevant information from
	these emails would let users track their journeys and important updates on
	applications installed on their devices to give them a consolidated overview
	of their itineraries and also save valuable time. We investigate the use of an
	HMM-based named entity recognizer on such emails which we will use to label and
	extract relevant entities. NER in such emails is challenging as these
	itineraries offer less useful contextual information. We also propose a rich
	set of features which are integrated into the model and are specific to our
	domain. The result from our model is a list of lists containing the relevant
	information extracted from one's itinerary.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_047}
}

@InProceedings{kkedzia-piasecki-janz:2017:RANLP,
  author    = {K\k{e}dzia, Pawe{\l}  and  Piasecki, Maciej  and  Janz, Arkadiusz},
  title     = {Graph-Based Approach to Recognizing CST Relations in Polish Texts},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {363--371},
  abstract  = {This paper presents a supervised approach to the recognition of Cross-document
	Structure Theory (CST) relations in Polish texts.  In the proposed approach, a graph-based
	representation is constructed for sentences. Graphs are built on the basis of
	lexicalised syntactic-semantic relation extracted from text. Similarity between
	sentences is calculated from graph, and the similarity values are input to
	classifiers trained by Logistic Model Tree. Several different configurations of
	graph, as well as graph similarity methods were analysed for this task. The
	approach was evaluated on a large open corpus annotated manually with 17 types
	of selected CST relations. The configuration of experiments was similar to
	those known from SEMEVAL and we obtained very promising results.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_048}
}

@InProceedings{kobus-crego-senellart:2017:RANLP,
  author    = {Kobus, Catherine  and  Crego, Josep  and  Senellart, Jean},
  title     = {Domain Control for Neural Machine Translation},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {372--378},
  abstract  = {Machine translation systems are very sensitive to the domains they were trained
	on. Several domain adaptation techniques have already been deeply studied. We
	propose a
	new technique for neural machine translation (NMT) that we call domain control
	which is performed at runtime using a unique neural network covering multiple
	domains. The presented approach shows quality improvements when compared to
	dedicated domains translating on any of the covered domains and even on
	out-of-domain data. In addition, model parameters do not need to be
	re-estimated for each domain, making this effective to real use cases.
	Evaluation is carried out on English-to-French translation for two different
	testing scenarios. We first consider the case where an end-user performs
	translations on a known domain. Secondly, we consider the scenario where the
	domain is not known and predicted at the sentence level before translating.
	Results show consistent accuracy improvements for both conditions.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_049}
}

@InProceedings{kocmi-bojar:2017:RANLP,
  author    = {Kocmi, Tom  and  Bojar, Ond\v{r}ej},
  title     = {Curriculum Learning and Minibatch Bucketing in Neural Machine Translation},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {379--386},
  abstract  = {We examine the effects of particular orderings of sentence pairs on the on-line
	training of neural machine translation (NMT). We focus on two types of such
	orderings: (1) ensuring that each minibatch contains sentences similar in some
	aspect and (2) gradual inclusion of some sentence types as the training
	progresses (so called ``curriculum learning''). In our English-to-Czech
	experiments, the internal homogeneity of minibatches has no effect on the
	training but some of our ``curricula'' achieve a small improvement over the
	baseline.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_050}
}

@InProceedings{kocon-marcinczuk:2017:RANLP,
  author    = {Koco\'{n}, Jan  and  Marci\'{n}czuk, Micha{\l}},
  title     = {Improved Recognition and Normalisation of Polish Temporal Expressions},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {387--393},
  abstract  = {In this article we present the result of the recent research in the recognition
	and normalisation of Polish temporal expressions. The temporal information
	extracted from the text plays major role in many information extraction
	systems, like question answering, event recognition or discourse analysis. We
	proposed a new method for the temporal expressions normalisation, called
	Cascade of Partial Rules. Here we describe results achieved by updated version
	of Liner2 machine learning system.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_051}
}

@InProceedings{konkol:2017:RANLP,
  author    = {Konkol, Michal},
  title     = {Joint Unsupervised Learning of Semantic Representation of Words and Roles in Dependency Trees},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {394--400},
  abstract  = {In this paper, we introduce WoRel, a model that jointly learns word embeddings
	and a semantic representation of word relations. The model learns from plain
	text sentences and their dependency parse trees. The word embeddings produced
	by WoRel outperform Skip-Gram and GloVe in word similarity and syntactical word
	analogy tasks and have comparable results on word relatedness and semantic word
	analogy tasks. We show that the semantic representation of relations enables us
	to express the meaning of phrases and is a promising research direction for
	semantics at the sentence level.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_052}
}

@InProceedings{konopik-pravzak-steinberger:2017:RANLP,
  author    = {Konopik, Miloslav  and  Pra\v{z}\'{a}k, Ond\v{r}ej  and  Steinberger, David},
  title     = {Czech Dataset for Semantic Similarity and Relatedness},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {401--406},
  abstract  = {This paper introduces a Czech dataset for semantic similarity and semantic
	relatedness. The dataset contains word pairs with hand annotated scores that
	indicate the semantic similarity and semantic relatedness of the words. The
	dataset contains 953 word pairs compiled from 9 different sources. It contains
	words and their contexts taken from real text corpora including extra examples
	when the words are ambiguous. The dataset is annotated by 5 independent
	annotators. The average Spearman correlation coefficient of the annotation
	agreement is $r = 0.81$. We provide reference evaluation experiments with
	several methods for computing semantic similarity and relatedness.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_053}
}

@InProceedings{laali-kosseim:2017:RANLP,
  author    = {Laali, Majid  and  Kosseim, Leila},
  title     = {Improving Discourse Relation Projection to Build Discourse Annotated Corpora},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {407--416},
  abstract  = {The naive approach to annotation projection is not effective to project
	discourse annotations from one language to another because implicit relations
	are often changed to explicit ones and vice-versa in the translation. In this
	paper, we propose a novel approach based on the intersection between
	statistical word-alignment models to identify unsupported discourse
	annotations. This approach identified 65\% of the unsupported annotations in the
	English-French parallel sentences from Europarl. By filtering out these
	unsupported annotations, we induced the first PDTB-style discourse annotated
	corpus for French from Europarl. We then used this corpus to train a classifier
	to identify the discourse-usage of French discourse connectives and show a 15\%
	improvement of F1-score compared to the classifier trained on the non-filtered
	annotations.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_054}
}

@InProceedings{lafourcade-lebrun:2017:RANLP,
  author    = {Lafourcade, Mathieu  and  Le Brun, Nathalie},
  title     = {Extracting semantic relations via the combination of inferences, schemas and cooccurrences},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {417--423},
  abstract  = {Extracting semantic relations from texts is a good way to build and supply a
	knowledge base, an indispensable resource for text analysis. We propose and
	evaluate the combination of three ways of producing lexical-semantic relations.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_055}
}

@InProceedings{lafourcade-joubert-lebrun:2017:RANLP,
  author    = {Lafourcade, Mathieu  and  Joubert, Alain  and  Le Brun, Nathalie},
  title     = {If mice were reptiles, then reptiles could be mammals or How to detect errors in the JeuxDeMots lexical network?},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {424--430},
  abstract  = {Correcting errors in a data set is a critical issue. This task can be either
	hand-made by experts, or by crowdsourcing methods, or automatically done using
	algorithms. Although the rate of errors present in the JeuxDeMots network is
	rather low, it is important to reduce it. We present here automatic methods for
	detecting potential secondary errors that would result from automatic inference
	mechanisms when they rely on an initial error manually detected. Encouraging
	results also invite us to consider strategies that would automatically detect
	``erroneous'' initial relations, which could lead to the automatic detection of
	the majority of errors in the network.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_056}
}

@InProceedings{lenc-kral:2017:RANLP,
  author    = {Lenc, Ladislav  and  Kral, Pavel},
  title     = {Word Embeddings for Multi-label Document Classification},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {431--437},
  abstract  = {In this paper, we analyze and evaluate word embeddings for representation of
	longer texts in the multi-label classification scenario. The embeddings are
	used in three convolutional neural network topologies. The experiments are
	realized on the Czech \v{C}TK and English Reuters-21578 standard corpora. We
	compare the results
	of word2vec static and trainable embeddings with randomly initialized word
	vectors. We conclude that initialization does not play an important role for
	classification. However, learning of word vectors is crucial to obtain good
	results.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_057}
}

@InProceedings{li-dickinson:2017:RANLP,
  author    = {Li, Wen  and  Dickinson, Markus},
  title     = {Gender Prediction for Chinese Social Media Data},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {438--445},
  abstract  = {Social media provides users a platform to publish messages and socialize with
	others, and microblogs have gained more users than ever in recent years. With
	such usage, user profiling is a popular task in computational linguistics and
	text mining. Different approaches have been used to predict users' gender,
	age, and other information, but most of this work has been done on English and
	other Western languages. The goal of this project is to predict the gender of
	users based on their posts on Weibo, a Chinese micro-blogging platform. Given
	issues in Chinese word segmentation, we explore character and word n-grams as
	features for this task, as well as using character and word embeddings for
	classification. Given how the data is extracted, we approach the task on a
	per-post basis, and we show the difficulties of the task for both humans and
	computers. Nonetheless, we present encouraging results and point to future
	improvements.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_058}
}

@InProceedings{liao-xie:2017:RANLP,
  author    = {Liao, Zhihua  and  Xie, Yan},
  title     = {A Statistical Machine Translation Model with Forest-to-Tree Algorithm for Semantic Parsing},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {446--451},
  abstract  = {In this paper, we propose a novel supervised model for parsing natural language
	sentences into their formal semantic representations.  This model treats
	sentence-to-lambda-logical expression conversion within the framework of the
	statistical machine translation with forest-to-tree algorithm. To make this
	work, we transform the lambda-logical expression structure into a form suitable
	for the mechanics of statistical machine translation and useful for modeling.
	We show that our model is able to yield new state-of-the-art results on both
	standard datasets with simple features.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_059}
}

@InProceedings{londhe-srihari:2017:RANLP,
  author    = {Londhe, Nikhil  and  Srihari, Rohini},
  title     = {Summarizing World Speak: A Preliminary Graph Based Approach},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {452--458},
  abstract  = {Social media platforms play a crucial role in piecing together global news
	stories via their corresponding online discussions.  Thus, in this work, we
	introduce the problem of automatically summarizing massively multilingual
	microblog text streams. We discuss the challenges involved in both generating
	summaries as well as evaluating them. We introduce a simple word graph based
	approach that utilizes node neighborhoods to identify keyphrases and thus in
	turn, pick summary candidates. We also demonstrate the effectiveness of our
	method in generating precise summaries as compared to other popular techniques.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_060}
}

@InProceedings{loukachevitch-gerasimova:2017:RANLP,
  author    = {Loukachevitch, Natalia  and  Gerasimova, Anastasia},
  title     = {Human Associations Help to Detect Conventionalized Multiword Expressions},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {459--466},
  abstract  = {In this paper we show that if we want to obtain human evidence about
	conventionalization of some phrases, we should ask native speakers about
	associations they have to a given phrase and its component words. We have shown
	that if component words of a phrase have each other as frequent associations,
	then this phrase can be considered as conventionalized. Another type of
	conventionalized phrases can be revealed using two factors: low entropy of
	phrase associations and low intersection of component word and phrase
	associations. The association experiments were performed for the Russian
	language.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_061}
}

@InProceedings{malmasi-zampieri:2017:RANLP,
  author    = {Malmasi, Shervin  and  Zampieri, Marcos},
  title     = {Detecting Hate Speech in Social Media},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {467--472},
  abstract  = {In this paper we examine methods to detect hate speech in social media, while
	distinguishing this from general profanity. We aim to establish lexical
	baselines for this task by applying supervised classification methods using a
	recently released dataset annotated for this purpose. As features, our system
	uses character n-grams, word n-grams and word skip-grams. We obtain results of
	78\% accuracy in identifying posts across three classes. Results demonstrate
	that the main challenge lies in discriminating profanity and hate speech from
	each other. A number of directions for future work are discussed.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_062}
}

@InProceedings{marcinczuk-oleksy-kocon:2017:RANLP,
  author    = {Marci\'{n}czuk, Micha{\l}  and  Oleksy, Marcin  and  Koco\'{n}, Jan},
  title     = {Inforex --- a collaborative system for text corpora annotation and analysis},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {473--482},
  abstract  = {We report a first major upgrade of Inforex --- a web-based system for
	qualitative and collaborative text corpora annotation and analysis. Inforex is
	a part of Polish CLARIN infrastructure. It is integrated with a digital
	repository for storing and publishing language resources and allows to
	visualize, browse and annotate text corpora stored in the repository. As a
	result of a series of workshops for researchers from humanities and social
	sciences fields we improved the graphical interface to make the system more
	friendly and readable for non-experienced users. We also implemented a new
	functionality for gold standard annotation which includes private annotations
	and annotation agreement by a super-annotator.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_063}
}

@InProceedings{marcinczuk:2017:RANLP,
  author    = {Marci\'{n}czuk, Micha{\l}},
  title     = {Lemmatization of Multi-word Common Noun Phrases and Named Entities in Polish},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {483--491},
  abstract  = {In the paper we present a tool for lemmatization of multi-word common noun
	phrases and named entities for Polish called LemmaPL. The tool is based on a
	set of manually crafted rules and heuristics utilizing a set of dictionaries
	(including morphological, named entities and inflection patterns). The accuracy
	of lemmatization obtained by the tool reached 97.99\% on a dataset with
	multi-word common noun phrases and 86.17\% for case-sensitive evaluation on a
	dataset with named entities.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_064}
}

@InProceedings{mi-EtAl:2017:RANLP,
  author    = {Mi, Chenggang  and  Yang, Yating  and  Dong, Rui  and  Zhou, Xi  and  Wang, Lei  and  Li, Xiao  and  Jiang, Tonghai},
  title     = {Log-linear Models for Uyghur Segmentation in Spoken Language Translation},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {492--500},
  abstract  = {To alleviate data sparsity in spoken Uyghur machine translation, we proposed
	a log-linear based morphological segmentation approach. Instead of learning
	model only from monolingual annotated corpus, this approach optimizes Uyghur
	segmentation for spoken translation based on both bilingual and monolingual
	corpus. Our approach relies on several features such as traditional conditional
	random field (CRF) feature, bilingual word alignment feature and monolingual
	suffixword co-occurrence feature. Experimental results show that our proposed
	segmentation model for Uyghur spoken translation achieved 1.6 BLEU score
	improvements compared with the state-of-the-art baseline.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_065}
}

@InProceedings{mitrofan:2017:RANLP,
  author    = {Mitrofan, Maria},
  title     = {Bootstrapping a Romanian Corpus for Medical Named Entity Recognition},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {501--509},
  abstract  = {Named Entity Recognition (NER) is an important component of natural language
	processing (NLP), with applicability in biomedical domain, enabling
	knowledge-discovery from medical texts. Due to the fact that for the Romanian
	language there are only a few linguistic resources specific to the biomedical
	domain, it was created a sub-corpus specific to this domain.
	In this paper we present a newly developed Romanian sub-corpus for
	medical-domain NER, which is a valuable asset for the field of biomedical text
	processing. We provide a description of the sub-corpus, informative statistics
	about data-composition and we evaluate an automatic NER tool on the newly
	created resource.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_066}
}

@InProceedings{moreno-romaferri-moredapozo:2017:RANLP,
  author    = {Moreno, Isabel  and  Rom\'{a}-Ferri, Maria Teresa  and  Moreda Pozo, Paloma},
  title     = {A Domain and Language Independent Named Entity Classification Approach Based on Profiles and Local Information},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {510--518},
  abstract  = {This paper presents a Named Entity Classification system, which employs machine
	learning. Our methodology employs local entity information and profiles as
	feature set. All features are generated in an unsupervised manner. It is tested
	on two different data sets: (i) DrugSemantics Spanish corpus (Overall F1 =
	74.92), whose results are in-line with the state of the art without employing
	external domain-specific resources. And, (ii) English CoNLL2003 dataset
	(Overall F1 = 81.40), although our results are lower than
	previous work, these are reached without external knowledge or complex
	linguistic analysis. Last, using the same configuration for the two corpora,
	the difference of overall F1 is only 6.48 points (DrugSemantics = 74.92
	versus CoNLL2003 = 81.40). Thus, this result supports our hypothesis that our
	approach is language and domain independent and does not require any external
	knowledge or complex linguistic analysis.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_067}
}

@InProceedings{mukherjee-kubler:2017:RANLP,
  author    = {Mukherjee, Atreyee  and  K\"{u}bler, Sandra},
  title     = {Similarity Based Genre Identification for POS Tagging Experts \& Dependency Parsing},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {519--526},
  abstract  = {POS tagging and dependency parsing achieve good results for homogeneous
	datasets. However, these tasks are much more difficult on heterogeneous
	datasets. In (Mukherjee et al. 2016, 2017), we address this issue by creating
	genre experts for both POS tagging and parsing. We use topic modeling to 
	automatically separate training and test data into genres and to create
	annotation experts per genre by training separate models for each topic.
	However, this approach assumes that topic modeling is performed jointly on
	training and test sentences each time a new test sentence is encountered. We
	extend this work by assigning new test sentences to their genre expert by using
	similarity metrics. We investigate three different types of methods: 1) based
	on words highly associated with a genre by the topic modeler, 2) using a
	k-nearest neighbor classification approach, and  3) using perplexity to
	determine the closest topic. The results show that the choice of similarity
	metric has an effect on results and that we can reach  comparable accuracies to
	the joint topic modeling in POS tagging and dependency parsing, thus providing
	a viable and efficient approach to POS tagging and parsing a sentence by its
	genre expert.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_068}
}

@InProceedings{naderi-hirst:2017:RANLP1,
  author    = {Naderi, Nona  and  Hirst, Graeme},
  title     = {Recognizing Reputation Defence Strategies in Critical Political Exchanges},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {527--535},
  abstract  = {We propose a new task of automatically detecting reputation defence strategies
	in
	the field of computational argumentation. We cast the problem as relation
	classification, where given a pair of reputation threat and reputation defence,
	we determine the reputation defence strategy. We annotate a dataset of
	parliamentary questions and answers with reputation defence strategies. We then
	propose a model based on supervised learning to address the detection of these
	strategies, and report promising experimental results.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_069}
}

@InProceedings{naderi-hirst:2017:RANLP2,
  author    = {Naderi, Nona  and  Hirst, Graeme},
  title     = {Classifying Frames at the Sentence Level in News Articles},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {536--542},
  abstract  = {Previous approaches to generic frame classification analyze frames at the
	document level. Here, we propose a supervised based approach based on deep
	neural networks and distributional representations for classifying frames at
	the sentence level in news articles. We conduct our experiments on the publicly
	available Media Frames Corpus compiled from the U.S. Newspapers. Using (B)LSTMs
	and GRU networks to represent the meaning of frames, we demonstrate that our
	approach yields at least 14-point improvement over several baseline methods.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_070}
}

@InProceedings{nakov-vogel:2017:RANLP,
  author    = {Nakov, Preslav  and  Vogel, Stephan},
  title     = {Robust Tuning Datasets for Statistical Machine Translation},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {543--550},
  abstract  = {We explore the idea of automatically crafting a tuning dataset for Statistical
	Machine Translation (SMT) that makes the hyper-parameters of the SMT system
	more robust with respect to some specific deficiencies of the parameter tuning
	algorithms. This is an under-explored research direction, which can allow
	better parameter tuning.  In this paper, we achieve this goal by selecting a
	subset of the available sentence pairs, which are more suitable for specific
	combinations of optimizers, objective functions, and evaluation measures. We
	demonstrate the potential of the idea with the pairwise ranking optimization
	(PRO) optimizer, which is known to yield too short translations. We show that
	the learning problem can be alleviated by tuning on a subset of the development
	set, selected based on sentence length. In particular, using the longest 50\% of
	the tuning sentences, we achieve two-fold tuning speedup, and improvements in
	BLEU score that rival those of alternatives, which fix BLEU+1's smoothing
	instead.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_071}
}

@InProceedings{nakov-EtAl:2017:RANLP,
  author    = {Nakov, Preslav  and  Mihaylova, Tsvetomila  and  M{\`a}rquez, Llu{\'i}s  and  Shiroya, Yashkumar  and  Koychev, Ivan},
  title     = {Do Not Trust the Trolls: Predicting Credibility in Community Question Answering Forums},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {551--560},
  abstract  = {We address information credibility in community forums, in a setting in which
	the credibility of an answer posted in a question thread by a particular user
	has to be predicted. First, we motivate the problem and we create a publicly
	available annotated English corpus by crowdsourcing. Second, we propose a large
	set of features to predict the credibility of the answers. The features model
	the user, the answer, the question, the thread as a whole, and the interaction
	between them. Our experiments with ranking SVMs show that the credibility
	labels can be predicted with high performance according to several standard IR
	ranking metrics, thus supporting the potential usage of this layer of
	credibility information in practical applications. The features modeling the
	profile of the user (in particular trollness) turn out to be most important,
	but embedding features modeling the answer and the similarity between the
	question and the answer are also very relevant. Overall, half of the gap
	between the baseline performance and the perfect classifier can be covered
	using the proposed features.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_072}
}

@InProceedings{osenova-simov:2017:RANLP,
  author    = {Osenova, Petya  and  Simov, Kiril},
  title     = {{Bulgarian-English} and {English-Bulgarian} Machine Translation: System Design and Evaluation},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {561--568},
  abstract  = {The paper presents a deep factored machine translation (MT) system between
	English and Bulgarian languages in both directions. The MT system is hybrid. It
	consists of three main steps: (1) the source-language text is linguistically
	annotated, (2) it is translated to the target language with the Moses system,
	and (3) translation is post-processed with the help of the transferred
	linguistic annotation from the source text. Besides automatic evaluation we
	performed manual evaluation over a domain test suite of sentences demonstrating
	certain phenomena like imperatives, questions, etc.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_073}
}

@InProceedings{paul-das:2017:RANLP,
  author    = {Paul, Apurba  and  Das, Dipankar},
  title     = {Identification of Character Adjectives from {Mahabharata}},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {569--576},
  abstract  = {The present paper describes the identification of prominent characters and
	their adjectives from Indian mythological epic, Mahabharata, written in English
	texts. However, in contrast to the traditional approaches of named entity
	identification, the present system extracts hidden attributes associated with
	each of the characters (e.g., character adjectives). We observed distinct
	phrase level linguistic patterns that hint the presence of characters in
	different text spans. Such six patterns were used in order to extract the
	characters. On the other hand, a distinguishing set of novel features (e.g.,
	multi-word expression, nodes and paths of parse tree, immediate ancestors etc.)
	was employed. Further, the correlation of the features is also measured in
	order to identify the important features. Finally, we applied various machine
	learning algorithms (e.g., Naive Bayes, KNN, Logistic Regression, Decision
	Tree, Random Forest etc.) along with deep learning to classify the patterns as
	characters or non-characters in order to achieve decent accuracy. Evaluation
	shows that phrase level linguistic patterns as well as the adopted features are
	highly active in capturing characters and their adjectives.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_074}
}

@InProceedings{perezestruch-paredespalacios-rosso:2017:RANLP,
  author    = {P{\'e}rez Estruch, Carlos  and  Paredes Palacios, Roberto  and  Rosso, Paolo},
  title     = {Learning Multimodal Gender Profile using Neural Networks},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {577--582},
  abstract  = {Gender identification in social networks is one of the most popular aspects of
	user profile learning. Traditionally it has been linked to author profiling, a
	difficult problem to solve because of the little difference in the use of
	language between genders. This situation has led to the need of taking into
	account other information apart from textual data, favoring the emergence of
	multimodal data. The aim of this paper is to apply neural networks to perform
	data fusion, using an existing multimodal corpus, the NUS-MSS data set, that
	(not only) contains text data, but also image and location information. We
	improved previous results  in terms of macro accuracy (87.8\%) obtaining the
	state-of-the-art performance of 91.3\%.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_075}
}

@InProceedings{piasecki-mlynarczyk-kocon:2017:RANLP,
  author    = {Piasecki, Maciej  and  M{\l}ynarczyk, Ksenia  and  Koco{\'n}, Jan},
  title     = {Recognition of Genuine {Polish} Suicide Notes},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {583--591},
  abstract  = {In this article we present the result of the recent research in the recognition
	of genuine Polish suicide notes (SNs). We provide useful method to distinguish
	between SNs and other types of discourse, including counterfeited SNs. The
	method uses a wide range of word-based and semantic features and it was
	evaluated using Polish Corpus of Suicide Notes, which contains 1244 genuine
	SNs, expanded with manually prepared set of 334 counterfeited SNs and 2200
	letter-like texts from the Internet. We utilized the algorithm to create the
	class-related sense dictionaries to improve the result of SNs classification.
	The obtained results show that there are fundamental differences between
	genuine SNs and counterfeited SNs. The applied method of the sense dictionary
	construction appeared to be the best way of improving the model.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_076}
}

@InProceedings{pravzak-konopik:2017:RANLP,
  author    = {Pra{\v{z}}{\'a}k, Ond{\v{r}}ej  and  Konopik, Miloslav},
  title     = {Cross-Lingual {SRL} Based upon {Universal Dependencies}},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {592--600},
  abstract  = {In this paper, we introduce a cross-lingual Semantic Role Labeling (SRL) system
	with language independent features based upon Universal Dependencies. We
	propose two methods to convert SRL annotations from monolingual dependency
	trees into universal dependency trees. Our SRL system is based upon
	cross-lingual features derived from universal dependency trees and a supervised
	learning that utilizes a maximum entropy classifier. We design experiments to
	verify whether the Universal Dependencies are suitable for the cross-lingual
	SRL. The results are very promising and they open new interesting research
	paths for the future.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_077}
}

@InProceedings{rohanian-EtAl:2017:RANLP,
  author    = {Rohanian, Omid and Taslimipoor, Shiva and Yaneva, Victoria and Ha, Le An},
  title     = {Using Gaze Data to Predict Multiword Expressions},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {601--609},
  abstract  = {In recent years gaze data has been increasingly used to improve and
    evaluate NLP models due to the fact that it carries information about the
    cognitive processing of linguistic phenomena. In this paper we conduct a
    preliminary study towards the automatic identification of multiword
    expressions based on gaze features from native and non-native speakers of
    English. We report comparisons between a part-of-speech (POS) and frequency
    baseline to: i) a prediction model based solely on gaze data and ii) a
    combined model of gaze data, POS and frequency. In spite of the challenging
    nature of the task, best performance was achieved by the latter.
    Furthermore, we explore how the type of gaze data (from native versus
    non-native speakers) affects the prediction, showing that data from the two
    groups is discriminative to an equal degree for the task. Finally, we show
    that late processing measures are more predictive than early ones, which is
    in line with previous research on idioms and other formulaic structures.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_078}
}

@InProceedings{ruckle-gurevych:2017:RANLP,
  author    = {R{\"u}ckl{\'e}, Andreas  and  Gurevych, Iryna},
  title     = {Real-Time News Summarization with Adaptation to Media Attention},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {610--617},
  abstract  = {Real-time summarization of news events (RTS) allows persons to stay up-to-date
	on important topics that develop over time. With the occurrence of major
	sub-events, media attention increases and a large number of news articles are
	published. We propose a summarization approach that detects such changes and
	selects a suitable summarization configuration at run-time. In particular, at
	times with high media attention, our approach exploits the redundancy in
	content to  produce a more precise summary and avoid emitting redundant
	information. We find that our approach significantly outperforms a strong
	non-adaptive RTS baseline in terms of the emitted summary updates and achieves
	the best results on a recent web-scale dataset. It can successfully be applied
	to a different real-world dataset without requiring additional modifications.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_079}
}

@InProceedings{rudrapal-das:2017:RANLP,
  author    = {Rudrapal, Dwijen  and  Das, Amitava},
  title     = {Measuring the Limit of Semantic Divergence for {English} Tweets},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {618--624},
  abstract  = {In human language, an expression could be conveyed in many ways by different
	people. Even that the same person may express same sentence quite differently
	when addressing different audiences, using different modalities, or using
	different syntactic variations or may use different set of vocabulary. The
	possibility of such endless surface form of  text while the meaning of the text
	remains almost same, poses many challenges for Natural Language Processing
	(NLP) systems like question-answering system, machine translation system and
	text summarization. This research paper is an endeavor to understand the
	characteristic of such endless semantic divergence. In this research work we
	develop a corpus of 1525 semantic divergent sentences for 200 English tweets.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_080}
}

@InProceedings{ruppenhofer-steiner-wiegand:2017:RANLP,
  author    = {Ruppenhofer, Josef and Steiner, Petra and Wiegand, Michael},
  title     = {Evaluating the morphological compositionality of polarity},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {625--633},
  abstract  = {Unknown words are a challenge for any NLP task, including sentiment
    analysis. Here, we evaluate the extent to which sentiment polarity of
    complex words can be predicted based on their morphological make-up. We do
    this on German as it has very productive processes of derivation and
    compounding and many German hapax words, which are likely to bear
    sentiment, are morphologically complex. We present results of supervised
    classification experiments on new datasets with morphological parses and
    polarity annotations.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_081}
}

@InProceedings{rysova-EtAl:2017:RANLP,
  author    = {Rysov{\'a}, Kate{\v{r}}ina  and  Rysov{\'a}, Magdal{\'e}na  and  M{\'i}rovsk{\'y}, Ji{\v{r}}{\'i}  and  Nov{\'a}k, Michal},
  title     = {Introducing {EVALD} -- Software Applications for Automatic Evaluation of Discourse in {Czech}},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {634--641},
  abstract  = {In the paper, we introduce two software applications for automatic evaluation
	of coherence in Czech texts called EVALD -- Evaluator of Discourse. The first
	one -- EVALD 1.0 -- evaluates texts written by native speakers of Czech on a
	five-step scale commonly used at Czech schools (grade 1 is the best, grade 5 is
	the worst). The second application is EVALD 1.0 for Foreigners assessing texts
	by non-native speakers of Czech using six-step scale (A1--C2) according to
	CEFR. Both applications are available online at
	https://lindat.mff.cuni.cz/services/evald-foreign/.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_082}
}

@InProceedings{salton-ross-kelleher:2017:RANLP,
  author    = {Salton, Giancarlo  and  Ross, Robert  and  Kelleher, John},
  title     = {Idiom Type Identification with Smoothed Lexical Features and a Maximum Margin Classifier},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {642--651},
  abstract  = {In our work we address limitations in the state-of-the-art in idiom type
	identification. We investigate different approaches for a lexical fixedness
	metric, a component of the state-of-the-art model. We also show that our
	Machine Learning based approach to the idiom type identification task achieves
	an F1-score of 0.85, an improvement of 11 points over the state-of-the-art.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_083}
}

@InProceedings{satthar-evans-uchyigit:2017:RANLP,
  author    = {Satthar, F. Sharmila  and  Evans, Roger  and  Uchyigit, Gulden},
  title     = {A Calibration Method for Evaluation of Sentiment Analysis},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {652--660},
  abstract  = {Sentiment analysis is the computational task of extracting sentiment from a
	text document --  for example whether it expresses a positive, negative or
	neutral opinion. Various approaches have been introduced in recent years, using
	a range of different techniques to extract sentiment information from a
	document. Measuring these methods against a gold standard dataset is a useful
	way to evaluate  such systems. However, different sentiment analysis techniques
	represent sentiment values in different ways, such as discrete categorical
	classes or continuous numerical sentiment scores. This creates a challenge for
	evaluating and comparing such systems; in particular assessing numerical
	scores against datasets that use fixed classes is difficult, because the
	numerical outputs have to be mapped onto the ordered classes. This paper
	proposes a novel calibration technique that uses precision vs. recall curves to
	set class thresholds to optimize a continuous sentiment analyser's performance
	against a discrete gold standard dataset. In experiments mapping a continuous
	score onto a three-class classification of movie reviews, we show that
	calibration results in a substantial increase in f-score when compared to a
	non-calibrated mapping.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_084}
}

@InProceedings{semmar-laib:2017:RANLP,
  author    = {Semmar, Nasredine  and  Laib, Mariama},
  title     = {Building Multiword Expressions Bilingual Lexicons for Domain Adaptation of an Example-Based Machine Translation System},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {661--670},
  abstract  = {We describe in this paper a hybrid approach to build automatically bilingual
	lexicons of Multiword Expressions (MWEs) from parallel corpora. We more
	specifically investigate the impact of using a domain-specific bilingual
	lexicon of MWEs on domain adaptation of an Example-Based Machine Translation
	(EBMT) system. We conducted experiments on the English-French language pair and
	two kinds of texts: in-domain texts from Europarl (European Parliament
	proceedings) and out-of-domain texts from Emea (European Medicines Agency
	documents) and Ecb (European Central Bank corpus). The obtained results
	indicate that integrating domain-specific bilingual lexicons of MWEs improves
	translation quality of the EBMT system when texts to translate are related to
	the specific domain and induces a relatively slight deterioration of
	translation quality when translating general-purpose texts.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_085}
}

@InProceedings{simaki-EtAl:2017:RANLP,
  author    = {Simaki, Vasiliki  and  Simakis, Panagiotis  and  Paradis, Carita  and  Kerren, Andreas},
  title     = {Identifying the Authors' National Variety of {English} in Social Media Texts},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {671--678},
  abstract  = {In this paper, we present a study for the identification of authors' national
	variety of English in texts from social media. In data from Facebook and
	Twitter, information about the author's social profile is annotated, and the
	national English variety (US, UK, AUS, CAN, NNS) that each author uses is
	attributed. We tested four feature types: formal linguistic features, POS
	features, lexicon-based features related to the different varieties, and
	data-based features from each English variety. We used various machine learning
	algorithms for the classification experiments, and we implemented a feature
	selection process. The classification accuracy achieved, when the 31 highest
	ranked features were used, was up to 77.32\%. The experimental results are
	evaluated, and the efficacy of the ranked features discussed.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_086}
}

@InProceedings{simov-boytcheva-osenova:2017:RANLP,
  author    = {Simov, Kiril and Boytcheva, Svetla and Osenova, Petya},
  title     = {Towards Lexical Chains for Knowledge-Graph-based Word Embeddings},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {679--685},
  abstract  = {Word vectors with varying dimensionalities and produced by different
    algorithms have been extensively used in NLP. The corpora that the
    algorithms are trained on can contain either natural language text (e.g.
    Wikipedia or newswire articles) or artificially-generated pseudo corpora
    due to natural data sparseness. We exploit Lexical Chain based templates
    over Knowledge Graph for generating pseudo-corpora with controlled
    linguistic value. These corpora are then used for learning word embeddings.
    A number of experiments have been conducted over the following test sets:
    WordSim353 Similarity, WordSim353 Relatedness and SimLex-999. The results
    show that, on the one hand, the incorporation of many-relation lexical
    chains improves results, but on the other hand, unrestricted-length chains
    remain difficult to handle with respect to their huge quantity.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_087}
}

@InProceedings{simova-uszkoreit:2017:RANLP,
  author    = {Simova, Iliana  and  Uszkoreit, Hans},
  title     = {Word Embeddings as Features for Supervised Coreference Resolution},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {686--693},
  abstract  = {A common reason for errors in coreference resolution is the lack of semantic
	information to help determine the compatibility between mentions referring to
	the same entity. Distributed representations, which have been shown successful
	in encoding relatedness between words, could potentially be a good source of
	such knowledge. Moreover, being obtained in an unsupervised manner, they could
	help address data sparsity issues in labeled training data at a small cost. In
	this work we investigate whether and to what extend features derived from word
	embeddings can be successfully used for supervised coreference resolution. We
	experiment with several word embedding models, and several different types of
	embedding-based features, including embedding cluster and cosine
	similarity-based features. Our evaluations show improvements in the performance
	of a supervised state-of-the-art coreference system.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_088}
}

@InProceedings{steinberger-EtAl:2017:RANLP1,
  author    = {Steinberger, Josef  and  Brychc{\'i}n, Tom{\'a}{\v{s}}  and  Hercig, Tom{\'a}{\v{s}}  and  Krejzl, Peter},
  title     = {Cross-lingual Flames Detection in News Discussions},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {694--700},
  abstract  = {We introduce Flames Detector, an online system for measuring flames, i.e.
	strong negative feelings or emotions, insults or other verbal offences,
	in news commentaries across five languages. It is designed to assist
	journalists, public institutions or discussion moderators to detect news topics
	which evoke wrangles. 
	We propose a machine learning approach to flames detection and calculate an
	aggregated score for a set of comment threads.
	The demo application shows the most flaming topics of the current period in
	several language variants.
	The search functionality gives a possibility to measure flames in any topic
	specified by a query.
	The evaluation shows that the flame detection in discussions is a difficult
	task,
	however, the application can already reveal interesting information about the
	actual news discussions.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_089}
}

@InProceedings{steinberger-krejzl-brychcin:2017:RANLP,
  author    = {Steinberger, Josef  and  Krejzl, Peter  and  Brychc{\'i}n, Tom{\'a}{\v{s}}},
  title     = {Pyramid-based Summary Evaluation Using Abstract Meaning Representation},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {701--706},
  abstract  = {We propose a novel metric for evaluating
	summary content coverage. The evaluation
	framework follows the Pyramid approach
	to measure how many summarization
	content units, considered important by
	human annotators, are contained in an automatic
	summary. Our approach automatizes
	the evaluation process, which does not
	need any manual intervention on the evaluated
	summary side. Our approach compares
	abstract meaning representations of
	each content unit mention and each summary
	sentence. We found that the proposed
	metric complements well the widely-used
	ROUGE metrics.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_090}
}

@InProceedings{steinberger-EtAl:2017:RANLP2,
  author    = {Steinberger, Ralf  and  Hegele, Stefanie  and  Tanev, Hristo  and  della Rocca, Leonida},
  title     = {Large-scale news entity sentiment analysis},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {707--715},
  abstract  = {We work on detecting positive or negative sentiment towards named entities in
	very large volumes of news articles. The aim is to monitor changes over time,
	as well as to work towards media bias detection by comparing differences
	across news sources and countries. With view to applying the same method to
	dozens of languages, we use linguistically light-weight methods: searching for
	positive and negative terms in bags of words around entity mentions (also
	considering negation). Evaluation results are good and better than a
	third-party baseline system, but precision is not sufficiently high to display
	the results publicly in our multilingual news analysis system Europe Media
	Monitor (EMM). In this paper, we focus on describing our effort to improve the
	English language results by avoiding the biggest sources of errors. We also
	present new work on using a syntactic parser to identify safe opinion
	recognition rules, such as predicative structures in which sentiment words
	directly refer to an entity. The precision of this method is good, but recall
	is very low.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_091}
}

@InProceedings{csulea-EtAl:2017:RANLP,
  author    = {{\c{S}}ulea, Octavia-Maria  and  Zampieri, Marcos  and  Vela, Mihaela  and  van Genabith, Josef},
  title     = {Predicting the Law Area and Decisions of {French Supreme Court} Cases},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {716--722},
  abstract  = {In this paper, we investigate the application of text classification methods to
	predict the law area and the decision of cases judged by the French Supreme
	Court. We also investigate the influence of the time period in which a ruling
	was made over the textual form of the case description and the extent to which
	it is necessary to mask the judge's motivation for a ruling to emulate a
	real-world test scenario. We report results of 96\% f1 score in predicting a
	case ruling, 90\% f1 score in predicting the law area of a case, and 75.9\% f1
	score in estimating the time span when a ruling has been issued using a linear
	Support Vector Machine (SVM) classifier trained on lexical features.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_092}
}

@InProceedings{sumalvico:2017:RANLP,
  author    = {Sumalvico, Maciej},
  title     = {Unsupervised Learning of Morphology with Graph Sampling},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {723--732},
  abstract  = {We introduce a language-independent, graph-based probabilistic model
    of morphology, which uses transformation rules operating on whole words
    instead of the traditional morphological segmentation. The morphological
    analysis of a set of words is expressed through a graph having words as
    vertices and structural relationships between words as edges. We define a
    probability distribution over such graphs and develop a sampler based on
    the Metropolis-Hastings algorithm. The sampling is applied in order to
    determine the strength of morphological relationships between words, filter
    out accidental similarities and reduce the set of rules necessary to
    explain the data. The model is evaluated on the task of finding pairs of
    morphologically similar words, as well as generating new words. The results
    are compared to a state-of-the-art segmentation-based approach.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_093}
}

@InProceedings{sweeney-padmanabhan:2017:RANLP,
  author    = {Sweeney, Colm  and  Padmanabhan, Deepak},
  title     = {Multi-entity sentiment analysis using entity-level feature extraction and word embeddings approach},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {733--740},
  abstract  = {The sentiment analysis task has been traditionally divided into lexicon or
	machine learning approaches, but recently the use of word embeddings methods
	have emerged, that provide powerful algorithms to allow semantic understanding
	without the task of creating large amounts of annotated test data. One problem
	with this type of binary classification, is that the sentiment output will be
	in the form of `1' (positive) or `0' (negative) for the string of text
	in the tweet, regardless if there are one or more entities referred to in the
	text. This paper plans to enhance the word embeddings approach with the
	deployment of a sentiment lexicon-based technique to appoint a total score that
	indicates the polarity of opinion in relation to a particular entity or
	entities. This type of sentiment classification is a way of associating a given
	entity with the adjectives, adverbs, and verbs describing it, and extracting
	the associated sentiment to try and infer if the text is positive or negative
	in relation to the entity or entities.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_094}
}

@InProceedings{tahmasebi-risse:2017:RANLP,
  author    = {Tahmasebi, Nina  and  Risse, Thomas},
  title     = {Finding Individual Word Sense Changes and their Delay in Appearance},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {741--749},
  abstract  = {We present a method for detecting word sense changes by utilizing automatically
	induced word senses. Our method works on the level of individual senses and
	allows a word to have e.g. one stable sense and then add a novel sense that
	later experiences change. Senses are grouped based on polysemy to find
	linguistic concepts and we can find broadening and narrowing as well as novel
	(polysemous and homonymic) senses. We evaluate on a testset, present recall and
	estimates of the time between expected and found change.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_095}
}

@InProceedings{thomas-EtAl:2017:RANLP,
  author    = {Thomas, Philippe  and  Kirschnick, Johannes  and  Hennig, Leonhard  and  Ai, Renlong  and  Schmeier, Sven  and  Hemsen, Holmer  and  Xu, Feiyu  and  Uszkoreit, Hans},
  title     = {Streaming Text Analytics for Real-Time Event Recognition},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {750--757},
  abstract  = {A huge body of continuously growing written knowledge is available on the web
	in the form of social media posts, RSS feeds, and news articles. Real-time
	information extraction from such high velocity, high volume text streams
	requires scalable, distributed natural language processing pipelines. We
	introduce such a system for fine-grained event recognition within the big data
	framework Flink, and demonstrate its capabilities for extracting and
	geo-locating mobility- and industry-related events from heterogeneous text
	sources. Performance analyses conducted on several large datasets show that our
	system achieves high throughput and maintains low latency, which is crucial
	when events need to be detected and acted upon in real-time. We also present
	promising experimental results for the event extraction component of our
	system, which recognizes a novel set of event types. The demo system is
	available at http://dfki.de/sd4m-sta-demo/.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_096}
}

@InProceedings{tokunaga-nishikawa-iwakura:2017:RANLP,
  author    = {Tokunaga, Takenobu  and  Nishikawa, Hitoshi  and  Iwakura, Tomoya},
  title     = {An Eye-tracking Study of Named Entity Annotation},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {758--764},
  abstract  = {Utilising effective features in machine learning-based natural language
	processing (NLP) is crucial in achieving good performance for a given NLP task.
	 The paper describes a pilot study on the analysis of eye-tracking data during
	named entity (NE) annotation, aiming at obtaining insights into effective
	features for the NE recognition task. The eye gaze data were collected from 10
	annotators and analysed regarding working time and fixation distribution.  The
	results of the preliminary qualitative analysis showed that human annotators
	tend to look at broader contexts around the target NE than recent
	state-of-the-art automatic NE recognition systems and to use predicate argument
	relations to identify the NE categories.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_097}
}

@InProceedings{tsekouras-varlamis-giannakopoulos:2017:RANLP,
  author    = {Tsekouras, Leonidas  and  Varlamis, Iraklis  and  Giannakopoulos, George},
  title     = {A Graph-based Text Similarity Measure That Employs Named Entity Information},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {765--771},
  abstract  = {Text comparison is an interesting though hard task, with many applications in
	Natural Language Processing. This work introduces a new text-similarity
	measure, which employs named-entities' information extracted from the texts and
	the n-gram graphs' model for representing documents. Using OpenCalais as a
	named-entity recognition service and the JINSECT toolkit for constructing and
	managing n-gram graphs, the text similarity measure is embedded in a text
	clustering algorithm (k-Means). The evaluation of the produced clusters with
	various clustering validity metrics shows that the extraction of named entities
	at a first step can be profitable for the time-performance of similarity
	measures that are based on the n-gram graph representation without affecting
	the overall performance of the NLP task.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_098}
}

@InProceedings{wawer-mykowiecka:2017:RANLP,
  author    = {Wawer, Aleksander  and  Mykowiecka, Agnieszka},
  title     = {Detecting Metaphorical Phrases in the {Polish} Language},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {772--777},
  abstract  = {In this paper we describe experiments with automated detection of metaphors in
	the Polish language. We focus our analysis on noun phrases composed of an
	adjective and a noun, and distinguish three types of expressions: with literal
	sense, with metaphorical sense, and expressions both literal and metaphorical
	(context-dependent). We propose a method of automatically recognizing
	expression type using word embeddings and neural networks. We evaluate multiple
	neural network architectures and demonstrate that the method significantly
	outperforms strong baselines.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_099}
}

@InProceedings{weegar-nygrard-dalianis:2017:RANLP,
  author    = {Weegar, Rebecka  and  Nyg{\r{a}}rd, Jan F.  and  Dalianis, Hercules},
  title     = {Efficient Encoding of Pathology Reports Using Natural Language Processing},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {778--783},
  abstract  = {In this article we present a system that extracts information from pathology
	reports. The reports are written in Norwegian and contain free text describing
	prostate biopsies. Currently, these reports are manually coded for research and
	statistical purposes by trained experts at the Cancer Registry of Norway where
	the coders extract values for a set of predefined fields that are specific for
	prostate cancer.
	The presented system is rule based and achieves an average F-score of 0.91 for
	the fields Gleason grade, Gleason score, the number of biopsies that contain
	tumor tissue, and the orientation of the biopsies. The system also identifies
	reports that contain ambiguity or other content that should be reviewed by an
	expert. The system shows potential to encode the reports considerably faster,
	with less resources, and similar high quality to the manual encoding.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_100}
}

@InProceedings{yang-zhang-dong:2017:RANLP,
  author    = {Yang, Jie  and  Zhang, Yue  and  Dong, Fei},
  title     = {Neural Reranking for Named Entity Recognition},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {784--792},
  abstract  = {We propose a neural reranking system for named entity recognition (NER),
	which leverages recurrent neural network models to learn sentence-level patterns that
	involve named entity mentions. In particular, given an output sentence produced
	by a baseline NER model, we replace all entity mentions, such as \textit{Barack
	Obama}, into their entity types, such as \textit{PER}. The resulting sentence
	patterns contain direct output information, yet is less sparse without specific
	named entities. For example, ``PER was born in LOC'' can be such a pattern.
	LSTM and CNN structures are utilised for learning deep representations of such
	sentences for reranking. Results show that our system can significantly improve
	the NER accuracies over two different baselines, giving the best reported
	results on a standard benchmark.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_101}
}

@InProceedings{yao-EtAl:2017:RANLP,
  author    = {Yao, Wenlin  and  Dai, Zeyu  and  Huang, Ruihong  and  Caverlee, James},
  title     = {Online Deception Detection Refueled by Real World Data Collection},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {793--802},
  abstract  = {The lack of large realistic datasets presents a bottleneck in online deception
	detection studies. In this paper, we apply a data collection method based on
	social network analysis to quickly identify high quality deceptive and truthful
	online reviews from Amazon. The dataset contains more than 10,000 deceptive
	reviews and is diverse in product domains and reviewers. Using this dataset, we
	explore effective general features for online deception detection that perform
	well across domains. We demonstrate that with generalized features --
	advertising speak and writing complexity scores -- deception detection
	performance can be further improved by adding additional deceptive reviews from
	assorted domains in training. Finally, reviewer level evaluation gives an
	interesting insight into different deceptive reviewers' writing styles.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_102}
}

@InProceedings{yao-nettyam-huang:2017:RANLP,
  author    = {Yao, Wenlin  and  Nettyam, Saipravallika  and  Huang, Ruihong},
  title     = {A Weakly Supervised Approach to Train Temporal Relation Classifiers and Acquire Regular Event Pairs Simultaneously},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {803--812},
  abstract  = {Capabilities of detecting temporal and causal relations between two events can
	benefit many applications. Most existing temporal relation classifiers
	were trained in a supervised manner. Instead, we explore the observation
	that regular event pairs show a consistent temporal relation despite their
	various contexts and these rich contexts can be used to train a contextual
	temporal relation classifier, which can further recognize new temporal relation
	contexts and identify new regular event pairs. We focus on detecting after and
	before temporal relations and design a weakly supervised learning approach that
	extracts thousands of regular event pairs and learns a contextual temporal
	relation classifier simultaneously. Evaluation shows that the acquired regular
	event pairs are of high quality and contain rich commonsense knowledge and
	domain specific knowledge. In addition, the weakly supervised  trained temporal
	relation  classifier achieves comparable performance with the state-of-the-art
	supervised systems.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_103}
}

@InProceedings{yimam-EtAl:2017:RANLP,
  author    = {Yimam, Seid Muhie  and  {\v{S}}tajner, Sanja  and  Riedl, Martin  and  Biemann, Chris},
  title     = {Multilingual and Cross-Lingual Complex Word Identification},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {813--822},
  abstract  = {Complex Word Identification (CWI) is an important task in lexical
	simplification and text accessibility. Due to the lack of CWI datasets,
	previous works largely depend on Simple English Wikipedia and edit histories
	for obtaining `gold standard' annotations, which are of doubtable quality,
	and limited only to English. We collect complex words/phrases (CP) for English,
	German and Spanish, annotated by both native and non-native speakers, and
	propose language independent features that can be used to train multilingual
	and cross-lingual CWI models. We show that the performance of cross-lingual CWI
	systems (using a model trained on one language and applying it on the other
	languages) is comparable to the performance of monolingual CWI systems.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_104}
}

@InProceedings{yordanova:2017:RANLP1,
  author    = {Yordanova, Kristina},
  title     = {Automatic Generation of Situation Models for Plan Recognition Problems},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {823--830},
  abstract  = {Recent attempts at behaviour understanding through language grounding have
	shown that it is possible to automatically generate models for planning
	problems from textual instructions.
	One drawback of these approaches is that they either do not make use of the
	semantic structure behind the model elements identified in the text, or they
	manually incorporate a collection of concepts with semantic relationships
	between them. 
	We call this collection of knowledge situation model. 
	The situation model introduces additional context information to the model.
	It could also potentially reduce the complexity of the planning problem
	compared to models that do not use situation models.
	To address this problem, we propose an approach that automatically generates
	the situation model from textual instructions. 
	The approach is able to identify various hierarchical, spatial, directional,
	and causal relations. 
	We use the situation model to automatically generate planning problems in a
	PDDL notation and we show that the situation model reduces the complexity of
	the PDDL model in terms of number of operators and branching factor compared to
	planning models that do not make use of situation models.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_105}
}

@InProceedings{yordanova:2017:RANLP2,
  author    = {Yordanova, Kristina},
  title     = {A Simple Model for Improving the Performance of the {Stanford} Parser for Action Detection in Textual Instructions},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {831--838},
  abstract  = {Different approaches for behaviour understanding rely on textual instructions
	to generate models of human behaviour. 
	These approaches usually use state of the art parsers to obtain the part of
	speech (POS) meaning and dependencies of the words in the instructions.
	For them it is essential that the parser is able to correctly annotate the
	instructions and especially the verbs as they describe the actions of the
	person.
	State of the art parsers usually make errors when annotating textual
	instructions, as they have short sentence structure often in imperative form.
	 The inability of the parser to identify the verbs results in the inability of
	behaviour understanding systems to identify the relevant actions.
	To address this problem, we propose a simple rule-based model that attempts to
	correct any incorrectly annotated verbs.
	We argue that the model is able to significantly improve the parser's
	performance without the need of additional training data. 
	We evaluate our approach by extracting the actions from 61 textual instructions
	annotated only with the Stanford parser and once again after applying our
	model. 
	The results show a significant improvement in the recognition rate when
	applying the rules (75\% accuracy compared to 68\% without the rules, p-value $<$
	0.001).},
  url       = {https://doi.org/10.26615/978-954-452-049-6_106}
}

@InProceedings{zilio-wilkens-fairon:2017:RANLP,
  author    = {Zilio, Leonardo  and  Wilkens, Rodrigo  and  Fairon, C{\'e}drick},
  title     = {Using {NLP} for Enhancing Second Language Acquisition},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017},
  month     = {September},
  year      = {2017},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd.},
  pages     = {839--846},
  abstract  = {This study presents SMILLE, a system that draws on the Noticing Hypothesis and
	on input enhancements, addressing the lack of salience of grammatical
	information in online documents chosen by a given user. By means of input
	enhancements, the system can draw the user's attention to grammar, which
	could possibly lead to a higher intake per input ratio for metalinguistic
	information. The system receives as input an online document and submits it to
	a combined processing of parser and hand-written rules for detecting its
	grammatical structures. The input text can be freely chosen by the user,
	providing a more engaging experience and reflecting the user's interests. The
	system can enhance a total of 107 fine-grained types of grammatical structures
	that are based on the CEFR. An evaluation of some of those structures resulted
	in an overall precision of 87\%.},
  url       = {https://doi.org/10.26615/978-954-452-049-6_107}
}