% LREC 2016 paper introducing Spidextor (SpiderLing + Bitextor); key is the ACL Anthology ID.
@inproceedings{L16-1471,
 abstract = {This paper presents an approach for building large monolingual corpora and, at the same time, extracting parallel data by crawling the top-level domain of a given language of interest. For gathering linguistically relevant data from top-level domains we use the SpiderLing crawler, modified to crawl data written in multiple languages. The output of this process is then fed to Bitextor, a tool for harvesting parallel data from a collection of documents. We call the system combining these two tools Spidextor, a blend of the names of its two crucial parts. We evaluate the described approach intrinsically by measuring the accuracy of the extracted bitexts from the Croatian top-level domain ".hr" and the Slovene top-level domain ".si", and extrinsically on the English-Croatian language pair by comparing an SMT system built from the crawled data with third-party systems. We finally present parallel datasets collected with our approach for the English-Croatian, English-Finnish, English-Serbian and English-Slovene language pairs.},
 address = {Portorož, Slovenia},
 author = {Ljubešić, Nikola and Esplà-Gomis, Miquel and Toral, Antonio and Ortiz Rojas, Sergio and Klubička, Filip},
 booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)},
 month = may,
 pages = {2949--2956},
 publisher = {European Language Resources Association (ELRA)},
 title = {Producing Monolingual and Parallel Web Corpora at the Same Time - {SpiderLing} and {Bitextor}'s Love Affair},
 url = {https://www.aclweb.org/anthology/L16-1471},
 year = {2016}
}

