% Rangel, Rosso, Brooke, Uitdenbogerd (2018): workshop paper, ACL Anthology id W18-1605.
% The month uses the predefined three-letter macro so the style can format or localise it.
@InProceedings{rangel-EtAl:2018:W18-16,
  author    = {Rangel, Francisco and Rosso, Paolo and Brooke, Julian and Uitdenbogerd, Alexandra},
  title     = {Cross-corpus Native Language Identification via Statistical Embedding},
  booktitle = {Proceedings of the Second Workshop on Stylistic Variation},
  month     = jun,
  year      = {2018},
  address   = {New Orleans},
  publisher = {Association for Computational Linguistics},
  pages     = {39--43},
  abstract  = {In this paper, we approach the task of native language identification in a realistic cross-corpus scenario where a model is trained with available data and has to predict the native language from data of a different corpus. The motivation behind this study is to investigate native language identification in the Australian academic scenario where a majority of students come from China, Indonesia, and Arabic-speaking nations. We have proposed a statistical embedding representation reporting a significant improvement over common single-layer approaches of the state of the art, identifying Chinese, Arabic, and Indonesian in a cross-corpus scenario. The proposed approach was shown to be competitive even when the data is scarce and imbalanced.},
  url       = {http://www.aclweb.org/anthology/W18-1605}
}

