@inproceedings{jalili-sabet-etal-2020-simalign,
title = "{S}im{A}lign: High Quality Word Alignments Without Parallel Training Data Using Static and Contextualized Embeddings",
author = {Jalili Sabet, Masoud and
Dufter, Philipp and
Yvon, Fran{\c{c}}ois and
Sch{\"u}tze, Hinrich},
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.147",
doi = "10.18653/v1/2020.findings-emnlp.147",
pages = "1627--1643",
abstract = "Word alignments are useful for tasks like statistical and neural machine translation (NMT) and cross-lingual annotation projection. Statistical word aligners perform well, as do methods that extract alignments jointly with translations in NMT. However, most approaches require parallel training data and quality decreases as less training data is available. We propose word alignment methods that require no parallel data. The key idea is to leverage multilingual word embeddings {--} both static and contextualized {--} for word alignment. Our multilingual embeddings are created from monolingual data only without relying on any parallel data or dictionaries. We find that alignments created from embeddings are superior for four and comparable for two language pairs compared to those produced by traditional statistical aligners {--} even with abundant parallel data; e.g., contextualized embeddings achieve a word alignment F1 for English-German that is 5 percentage points higher than eflomal, a high-quality statistical aligner, trained on 100k parallel sentences.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jalili-sabet-etal-2020-simalign">
<titleInfo>
<title>SimAlign: High Quality Word Alignments Without Parallel Training Data Using Static and Contextualized Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Masoud</namePart>
<namePart type="family">Jalili Sabet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Dufter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">François</namePart>
<namePart type="family">Yvon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hinrich</namePart>
<namePart type="family">Schütze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word alignments are useful for tasks like statistical and neural machine translation (NMT) and cross-lingual annotation projection. Statistical word aligners perform well, as do methods that extract alignments jointly with translations in NMT. However, most approaches require parallel training data and quality decreases as less training data is available. We propose word alignment methods that require no parallel data. The key idea is to leverage multilingual word embeddings – both static and contextualized – for word alignment. Our multilingual embeddings are created from monolingual data only without relying on any parallel data or dictionaries. We find that alignments created from embeddings are superior for four and comparable for two language pairs compared to those produced by traditional statistical aligners – even with abundant parallel data; e.g., contextualized embeddings achieve a word alignment F1 for English-German that is 5 percentage points higher than eflomal, a high-quality statistical aligner, trained on 100k parallel sentences.</abstract>
<identifier type="citekey">jalili-sabet-etal-2020-simalign</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.147</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.147</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>1627</start>
<end>1643</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SimAlign: High Quality Word Alignments Without Parallel Training Data Using Static and Contextualized Embeddings
%A Jalili Sabet, Masoud
%A Dufter, Philipp
%A Yvon, François
%A Schütze, Hinrich
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F jalili-sabet-etal-2020-simalign
%X Word alignments are useful for tasks like statistical and neural machine translation (NMT) and cross-lingual annotation projection. Statistical word aligners perform well, as do methods that extract alignments jointly with translations in NMT. However, most approaches require parallel training data and quality decreases as less training data is available. We propose word alignment methods that require no parallel data. The key idea is to leverage multilingual word embeddings – both static and contextualized – for word alignment. Our multilingual embeddings are created from monolingual data only without relying on any parallel data or dictionaries. We find that alignments created from embeddings are superior for four and comparable for two language pairs compared to those produced by traditional statistical aligners – even with abundant parallel data; e.g., contextualized embeddings achieve a word alignment F1 for English-German that is 5 percentage points higher than eflomal, a high-quality statistical aligner, trained on 100k parallel sentences.
%R 10.18653/v1/2020.findings-emnlp.147
%U https://aclanthology.org/2020.findings-emnlp.147
%U https://doi.org/10.18653/v1/2020.findings-emnlp.147
%P 1627-1643
Markdown (Informal)
[SimAlign: High Quality Word Alignments Without Parallel Training Data Using Static and Contextualized Embeddings](https://aclanthology.org/2020.findings-emnlp.147) (Jalili Sabet et al., Findings 2020)
ACL