@inproceedings{L16-1712,
 abstract = {In this paper, I describe a method of creating massively huge web corpora from the CommonCrawl data sets and redistributing the resulting annotations in a stand-off format. Current EU (and especially German) copyright legislation categorically forbids the redistribution of downloaded material without express prior permission by the authors. Therefore, such stand-off annotations (or other derivates) are the only format in which European researchers (like myself) are allowed to re-distribute the respective corpora. In order to make the full corpora available to the public despite such restrictions, the stand-off format presented here allows anybody to locally reconstruct the full corpora with the least possible computational effort.
},
 address = {Portorož, Slovenia},
 author = {Roland Schäfer},
 booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
 month = {May},
 pages = {4500--4504},
 publisher = {European Language Resources Association (ELRA)},
 title = {CommonCOW: Massively Huge Web Corpora from CommonCrawl Data and a Method to Distribute them Freely under Restrictive EU Copyright Laws},
 url = {https://www.aclweb.org/anthology/L16-1712},
 year = {2016}
}

