% LREC 2016 conference paper by Fothergill, Cook and Baldwin; the key L16-1042 matches the ID in the url field.
@inproceedings{L16-1042,
 abstract = {Web corpora are often constructed automatically, and their contents are therefore often not well understood. One technique for assessing the composition of such a web corpus is to empirically measure its similarity to a reference corpus whose composition is known. In this paper we evaluate a number of measures of corpus similarity, including a method based on topic modelling which has not been previously evaluated for this task. To evaluate these methods we use known-similarity corpora that have been previously used for this purpose, as well as a number of newly-constructed known-similarity corpora targeting differences in genre, topic, time, and region. Our findings indicate that, overall, the topic modelling approach did not improve on a chi-square method that had previously been found to work well for measuring corpus similarity.},
 address = {Portoro{\v{z}}, Slovenia},
 author = {Fothergill, Richard and Cook, Paul and Baldwin, Timothy},
 booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)},
 month = may,
 pages = {273--279},
 publisher = {European Language Resources Association (ELRA)},
 title = {Evaluating a Topic Modelling Approach to Measuring Corpus Similarity},
 url = {https://www.aclweb.org/anthology/L16-1042},
 year = {2016}
}

