% System description paper: submission to the WMT20 Parallel Corpus Filtering
% and Alignment for Low-Resource Conditions shared task.
@inproceedings{koerner-koehn-2020-dual,
  author    = {Koerner, Felicia and Koehn, Philipp},
  title     = {Dual Conditional Cross Entropy Scores and {LASER} Similarity Scores for the {WMT}20 Parallel Corpus Filtering Shared Task},
  booktitle = {Proceedings of the Fifth Conference on Machine Translation},
  year      = {2020},
  month     = nov,
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {966--971},
  url       = {https://aclanthology.org/2020.wmt-1.109},
  abstract  = {This paper describes our submission to the WMT20 Parallel Corpus Filtering and Alignment for Low-Resource Conditions Shared Task. This year{'}s corpora are noisy Khmer-English and Pashto-English, with 58.3 million and 11.6 million words respectively (English token count). Our submission focuses on filtering Pashto-English, building on previously successful methods to produce two sets of scores: LASER{\_}LM, a combination of the LASER similarity scores provided in the shared task and perplexity scores from language models, and DCCEF{\_}DUP, dual conditional cross entropy scores combined with a duplication penalty. We improve slightly on the LASER similarity score and find that the provided clean data can successfully be supplemented with a subsampled set of the noisy data, effectively increasing the training data for the models used for dual conditional cross entropy scoring.},
}
