@inproceedings{vazquez-etal-2021-differences,
    title = "On the differences between {BERT} and {MT} encoder spaces and how to address them in translation tasks",
    author = {V{\'a}zquez, Ra{\'u}l and
      Celikkanat, Hande and
      Creutz, Mathias and
      Tiedemann, J{\"o}rg},
    editor = "Kabbara, Jad and
      Lin, Haitao and
      Paullada, Amandalynne and
      Vamvas, Jannis",
    booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop",
    month = aug,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.acl-srw.35",
    doi = "10.18653/v1/2021.acl-srw.35",
    pages = "337--347",
    abstract = "Various studies show that pretrained language models such as BERT cannot straightforwardly replace encoders in neural machine translation despite their enormous success in other tasks. This is even more astonishing considering the similarities between the architectures. This paper sheds some light on the embedding spaces they create, using average cosine similarity, contextuality metrics and measures for representational similarity for comparison, revealing that BERT and NMT encoder representations look significantly different from one another. In order to address this issue, we propose a supervised transformation from one into the other using explicit alignment and fine-tuning. Our results demonstrate the need for such a transformation to improve the applicability of BERT in MT.",
}
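
The abstract names average cosine similarity among the metrics used to contrast the two embedding spaces. Purely as an illustrative sketch (not code from the paper; the function name, array shapes, and random toy inputs are assumptions), here is a minimal NumPy implementation of average pairwise cosine similarity within a set of contextual token embeddings, a common statistic for how anisotropic an encoder's representation space is; computing it separately for BERT and for an NMT encoder gives one of the space-level comparisons the abstract describes:

```python
import numpy as np

def avg_pairwise_cosine(X: np.ndarray) -> float:
    """Average cosine similarity over all distinct pairs of rows in X.

    X: (n_tokens, dim) matrix of contextual token embeddings, e.g.
    hidden states taken from one layer of BERT or of an NMT encoder.
    """
    Xn = X / np.linalg.norm(X, axis=1, keepdims=True)  # unit-normalize rows
    S = Xn @ Xn.T                                      # full cosine-similarity matrix
    n = X.shape[0]
    off_diag = S.sum() - np.trace(S)                   # drop self-similarities (diagonal of ones)
    return float(off_diag / (n * (n - 1)))

# Toy usage: random vectors standing in for real token embeddings.
rng = np.random.default_rng(0)
print(avg_pairwise_cosine(rng.normal(size=(100, 768))))
```
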
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="vazquez-etal-2021-differences">
    <titleInfo>
      <title>On the differences between BERT and MT encoder spaces and how to address them in translation tasks</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Raúl</namePart>
      <namePart type="family">Vázquez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hande</namePart>
      <namePart type="family">Celikkanat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mathias</namePart>
      <namePart type="family">Creutz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jörg</namePart>
      <namePart type="family">Tiedemann</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Jad</namePart>
        <namePart type="family">Kabbara</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Haitao</namePart>
        <namePart type="family">Lin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Amandalynne</namePart>
        <namePart type="family">Paullada</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jannis</namePart>
        <namePart type="family">Vamvas</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Various studies show that pretrained language models such as BERT cannot straightforwardly replace encoders in neural machine translation despite their enormous success in other tasks. This is even more astonishing considering the similarities between the architectures. This paper sheds some light on the embedding spaces they create, using average cosine similarity, contextuality metrics and measures for representational similarity for comparison, revealing that BERT and NMT encoder representations look significantly different from one another. In order to address this issue, we propose a supervised transformation from one into the other using explicit alignment and fine-tuning. Our results demonstrate the need for such a transformation to improve the applicability of BERT in MT.</abstract>
    <identifier type="citekey">vazquez-etal-2021-differences</identifier>
    <identifier type="doi">10.18653/v1/2021.acl-srw.35</identifier>
    <location>
      <url>https://aclanthology.org/2021.acl-srw.35</url>
    </location>
    <part>
      <date>2021-08</date>
      <extent unit="page">
        <start>337</start>
        <end>347</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T On the differences between BERT and MT encoder spaces and how to address them in translation tasks
%A Vázquez, Raúl
%A Celikkanat, Hande
%A Creutz, Mathias
%A Tiedemann, Jörg
%Y Kabbara, Jad
%Y Lin, Haitao
%Y Paullada, Amandalynne
%Y Vamvas, Jannis
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F vazquez-etal-2021-differences
%X Various studies show that pretrained language models such as BERT cannot straightforwardly replace encoders in neural machine translation despite their enormous success in other tasks. This is even more astonishing considering the similarities between the architectures. This paper sheds some light on the embedding spaces they create, using average cosine similarity, contextuality metrics and measures for representational similarity for comparison, revealing that BERT and NMT encoder representations look significantly different from one another. In order to address this issue, we propose a supervised transformation from one into the other using explicit alignment and fine-tuning. Our results demonstrate the need for such a transformation to improve the applicability of BERT in MT.
%R 10.18653/v1/2021.acl-srw.35
%U https://aclanthology.org/2021.acl-srw.35
%U https://doi.org/10.18653/v1/2021.acl-srw.35
%P 337-347
Markdown (Informal)
[On the differences between BERT and MT encoder spaces and how to address them in translation tasks](https://aclanthology.org/2021.acl-srw.35) (Vázquez et al., ACL-IJCNLP 2021)
ACL
Raúl Vázquez, Hande Celikkanat, Mathias Creutz, and Jörg Tiedemann. 2021. On the differences between BERT and MT encoder spaces and how to address them in translation tasks. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop, pages 337–347, Online. Association for Computational Linguistics.
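
The abstract also describes a supervised transformation from the BERT space into the NMT encoder space via explicit alignment (combined with fine-tuning). Purely as a hedged sketch of the simplest form such an alignment could take, and not the authors' implementation, the following fits a linear least-squares map between paired representations; all names, dimensions, and synthetic data here are assumptions:

```python
import numpy as np

def fit_linear_map(B: np.ndarray, E: np.ndarray) -> np.ndarray:
    """Least-squares W minimizing ||B @ W - E||_F.

    B: (n, d_bert) BERT representations and E: (n, d_enc) NMT encoder
    representations for the same n tokens (the supervision signal).
    """
    W, *_ = np.linalg.lstsq(B, E, rcond=None)
    return W

# Synthetic check: recover a planted map from noisy paired samples.
rng = np.random.default_rng(1)
B = rng.normal(size=(500, 768))
E = B @ rng.normal(size=(768, 512)) + 0.01 * rng.normal(size=(500, 512))
W = fit_linear_map(B, E)
print(np.linalg.norm(B @ W - E) / np.linalg.norm(E))  # relative residual, near zero
```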