% CL4LC 2016 workshop paper (anthology ID W16-4119, per the url field).
% The abstract was cleaned of PDF line-break hyphenation artifacts.
@InProceedings{wagnerfilho-wilkens-villavicencio:2016:CL4LC,
  author    = {Wagner Filho, Jorge Alberto and Wilkens, Rodrigo and Villavicencio, Aline},
  title     = {Automatic Construction of Large Readability Corpora},
  booktitle = {Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity ({CL4LC})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {164--173},
  abstract  = {This work presents a framework for the automatic construction of large Web
	corpora classified by readability level. We compare different Machine Learning
	classifiers for the task of readability assessment focusing on Portuguese and
	English texts, analysing the impact of variables like the feature inventory
	used in the resulting corpus. In a comparison between shallow and deeper
	features, the former already produce F-measures of over 0.75 for Portuguese
	texts, but the use of additional features results in even better results, in
	most cases. For English, shallow features also perform well as do classic
	readability formulas. Comparing different classifiers for the task, logistic
	regression obtained, in general, the best results, but with considerable
	differences between the results for two and those for three-classes,
	especially regarding the intermediary class. Given the large scale of the
	resulting corpus, for evaluation we adopt the agreement between different
	classifiers as an indication of readability assessment certainty. As a result
	of this work, a large corpus for Brazilian Portuguese was built, including 1.7
	million documents and about 1.6 billion tokens, already parsed and annotated
	with 134 different textual attributes, along with the agreement among the
	various classifiers.},
  url       = {http://aclweb.org/anthology/W16-4119}
}

