% CL4LC workshop paper. `month` uses the predefined BibTeX macro (unquoted) so
% that the bibliography style controls abbreviation and language; the acronym
% in `booktitle` is brace-protected against style recasing.
@InProceedings{albertsson-rennes-jonsson:2016:CL4LC,
  author    = {Albertsson, Sarah and Rennes, Evelina and Jonsson, Arne},
  title     = {Similarity-Based Alignment of Monolingual Corpora for Text Simplification Purposes},
  booktitle = {Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity ({CL4LC})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {154--163},
  abstract  = {Comparable or parallel corpora are beneficial for many NLP tasks. The automatic
               collection of corpora enables large-scale resources, even for less-resourced
               languages, which in turn can be useful for deducing rules and patterns for text
               rewriting algorithms, a subtask of automatic text simplification.
               We present two methods for the alignment of Swedish easy-to-read text segments
               to text segments from a reference corpus. The first method (M1) was originally
               developed for the task of text reuse detection, measuring sentence similarity
               by a modified version of a TF-IDF vector space model. A second method (M2),
               also accounting for part-of-speech tags, was developed, and the methods were
               compared.
               For evaluation, a crowdsourcing platform was built for human judgement data
               collection, and preliminary results showed that cosine similarity relates
               better to human ranks than the Dice coefficient. We also saw a tendency that
               including syntactic context to the TF-IDF vector space model is beneficial for
               this kind of paraphrase alignment task.},
  url       = {http://aclweb.org/anthology/W16-4118}
}

