@inproceedings{thompson-koehn-2019-vecalign,
title = "{V}ecalign: Improved Sentence Alignment in Linear Time and Space",
author = "Thompson, Brian and
Koehn, Philipp",
editor = "Inui, Kentaro and
Jiang, Jing and
Ng, Vincent and
Wan, Xiaojun",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D19-1136",
doi = "10.18653/v1/D19-1136",
pages = "1342--1348",
abstract = "We introduce Vecalign, a novel bilingual sentence alignment method which is linear in time and space with respect to the number of sentences being aligned and which requires only bilingual sentence embeddings. On a standard German{--}French test set, Vecalign outperforms the previous state-of-the-art method (which has quadratic time complexity and requires a machine translation system) by 5 F1 points. It substantially outperforms the popular Hunalign toolkit at recovering Bible verse alignments in medium- to low-resource language pairs, and it improves downstream MT quality by 1.7 and 1.6 BLEU in Sinhala-English and Nepali-English, respectively, compared to the Hunalign-based Paracrawl pipeline.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thompson-koehn-2019-vecalign">
<titleInfo>
<title>Vecalign: Improved Sentence Alignment in Linear Time and Space</title>
</titleInfo>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vincent</namePart>
<namePart type="family">Ng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hong Kong, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce Vecalign, a novel bilingual sentence alignment method which is linear in time and space with respect to the number of sentences being aligned and which requires only bilingual sentence embeddings. On a standard German–French test set, Vecalign outperforms the previous state-of-the-art method (which has quadratic time complexity and requires a machine translation system) by 5 F1 points. It substantially outperforms the popular Hunalign toolkit at recovering Bible verse alignments in medium- to low-resource language pairs, and it improves downstream MT quality by 1.7 and 1.6 BLEU in Sinhala-English and Nepali-English, respectively, compared to the Hunalign-based Paracrawl pipeline.</abstract>
<identifier type="citekey">thompson-koehn-2019-vecalign</identifier>
<identifier type="doi">10.18653/v1/D19-1136</identifier>
<location>
<url>https://aclanthology.org/D19-1136</url>
</location>
<part>
<date>2019-11</date>
<extent unit="page">
<start>1342</start>
<end>1348</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Vecalign: Improved Sentence Alignment in Linear Time and Space
%A Thompson, Brian
%A Koehn, Philipp
%Y Inui, Kentaro
%Y Jiang, Jing
%Y Ng, Vincent
%Y Wan, Xiaojun
%S Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F thompson-koehn-2019-vecalign
%X We introduce Vecalign, a novel bilingual sentence alignment method which is linear in time and space with respect to the number of sentences being aligned and which requires only bilingual sentence embeddings. On a standard German–French test set, Vecalign outperforms the previous state-of-the-art method (which has quadratic time complexity and requires a machine translation system) by 5 F1 points. It substantially outperforms the popular Hunalign toolkit at recovering Bible verse alignments in medium- to low-resource language pairs, and it improves downstream MT quality by 1.7 and 1.6 BLEU in Sinhala-English and Nepali-English, respectively, compared to the Hunalign-based Paracrawl pipeline.
%R 10.18653/v1/D19-1136
%U https://aclanthology.org/D19-1136
%U https://doi.org/10.18653/v1/D19-1136
%P 1342-1348
Markdown (Informal)
[Vecalign: Improved Sentence Alignment in Linear Time and Space](https://aclanthology.org/D19-1136) (Thompson & Koehn, EMNLP-IJCNLP 2019)
ACL
- Brian Thompson and Philipp Koehn. 2019. Vecalign: Improved Sentence Alignment in Linear Time and Space. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 1342–1348, Hong Kong, China. Association for Computational Linguistics.