@inproceedings{thompson-koehn-2020-exploiting,
    title = "Exploiting Sentence Order in Document Alignment",
    author = "Thompson, Brian  and
      Koehn, Philipp",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.emnlp-main.483",
    doi = "10.18653/v1/2020.emnlp-main.483",
    pages = "5997--6007",
    abstract = "We present a simple document alignment method that incorporates sentence order information in both candidate generation and candidate re-scoring. Our method results in 61{\%} relative reduction in error compared to the best previously published result on the WMT16 document alignment shared task. Our method improves downstream MT performance on web-scraped Sinhala{--}English documents from ParaCrawl, outperforming the document alignment method used in the most recent ParaCrawl release. It also outperforms a comparable corpora method which uses the same multilingual embeddings, demonstrating that exploiting sentence order is beneficial even if the end goal is sentence-level bitext.",
}
