@inproceedings{xu-etal-2021-probing,
title = "Probing Word Translations in the Transformer and Trading Decoder for Encoder Layers",
author = "Xu, Hongfei and
van Genabith, Josef and
Liu, Qiuhui and
Xiong, Deyi",
editor = "Toutanova, Kristina and
Rumshisky, Anna and
Zettlemoyer, Luke and
Hakkani-Tur, Dilek and
Beltagy, Iz and
Bethard, Steven and
Cotterell, Ryan and
Chakraborty, Tanmoy and
Zhou, Yichao",
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-main.7/",
doi = "10.18653/v1/2021.naacl-main.7",
pages = "74--85",
abstract = "Due to its effectiveness and performance, the Transformer translation model has attracted wide attention, most recently in terms of probing-based approaches. Previous work focuses on using or probing source linguistic features in the encoder. To date, the way word translation evolves in Transformer layers has not yet been investigated. Naively, one might assume that encoder layers capture source information while decoder layers translate. In this work, we show that this is not quite the case: translation already happens progressively in encoder layers and even in the input embeddings. More surprisingly, we find that some of the lower decoder layers do not actually do that much decoding. We show all of this in terms of a probing approach where we project representations of the layer analyzed to the final trained and frozen classifier level of the Transformer decoder to measure word translation accuracy. Our findings motivate and explain a Transformer configuration change: if translation already happens in the encoder layers, perhaps we can increase the number of encoder layers, while decreasing the number of decoder layers, boosting decoding speed, without loss in translation quality? Our experiments show that this is indeed the case: we can increase speed by up to a factor 2.3 with small gains in translation quality, while an 18-4 deep encoder configuration boosts translation quality by +1.42 BLEU (En-De) at a speed-up of 1.4."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xu-etal-2021-probing">
<titleInfo>
<title>Probing Word Translations in the Transformer and Trading Decoder for Encoder Layers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hongfei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josef</namePart>
<namePart type="family">van Genabith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiuhui</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deyi</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kristina</namePart>
<namePart type="family">Toutanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rumshisky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Zettlemoyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iz</namePart>
<namePart type="family">Beltagy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yichao</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Due to its effectiveness and performance, the Transformer translation model has attracted wide attention, most recently in terms of probing-based approaches. Previous work focuses on using or probing source linguistic features in the encoder. To date, the way word translation evolves in Transformer layers has not yet been investigated. Naively, one might assume that encoder layers capture source information while decoder layers translate. In this work, we show that this is not quite the case: translation already happens progressively in encoder layers and even in the input embeddings. More surprisingly, we find that some of the lower decoder layers do not actually do that much decoding. We show all of this in terms of a probing approach where we project representations of the layer analyzed to the final trained and frozen classifier level of the Transformer decoder to measure word translation accuracy. Our findings motivate and explain a Transformer configuration change: if translation already happens in the encoder layers, perhaps we can increase the number of encoder layers, while decreasing the number of decoder layers, boosting decoding speed, without loss in translation quality? Our experiments show that this is indeed the case: we can increase speed by up to a factor 2.3 with small gains in translation quality, while an 18-4 deep encoder configuration boosts translation quality by +1.42 BLEU (En-De) at a speed-up of 1.4.</abstract>
<identifier type="citekey">xu-etal-2021-probing</identifier>
<identifier type="doi">10.18653/v1/2021.naacl-main.7</identifier>
<location>
<url>https://aclanthology.org/2021.naacl-main.7/</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>74</start>
<end>85</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Probing Word Translations in the Transformer and Trading Decoder for Encoder Layers
%A Xu, Hongfei
%A van Genabith, Josef
%A Liu, Qiuhui
%A Xiong, Deyi
%Y Toutanova, Kristina
%Y Rumshisky, Anna
%Y Zettlemoyer, Luke
%Y Hakkani-Tur, Dilek
%Y Beltagy, Iz
%Y Bethard, Steven
%Y Cotterell, Ryan
%Y Chakraborty, Tanmoy
%Y Zhou, Yichao
%S Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F xu-etal-2021-probing
%X Due to its effectiveness and performance, the Transformer translation model has attracted wide attention, most recently in terms of probing-based approaches. Previous work focuses on using or probing source linguistic features in the encoder. To date, the way word translation evolves in Transformer layers has not yet been investigated. Naively, one might assume that encoder layers capture source information while decoder layers translate. In this work, we show that this is not quite the case: translation already happens progressively in encoder layers and even in the input embeddings. More surprisingly, we find that some of the lower decoder layers do not actually do that much decoding. We show all of this in terms of a probing approach where we project representations of the layer analyzed to the final trained and frozen classifier level of the Transformer decoder to measure word translation accuracy. Our findings motivate and explain a Transformer configuration change: if translation already happens in the encoder layers, perhaps we can increase the number of encoder layers, while decreasing the number of decoder layers, boosting decoding speed, without loss in translation quality? Our experiments show that this is indeed the case: we can increase speed by up to a factor 2.3 with small gains in translation quality, while an 18-4 deep encoder configuration boosts translation quality by +1.42 BLEU (En-De) at a speed-up of 1.4.
%R 10.18653/v1/2021.naacl-main.7
%U https://aclanthology.org/2021.naacl-main.7/
%U https://doi.org/10.18653/v1/2021.naacl-main.7
%P 74-85
Markdown (Informal)
[Probing Word Translations in the Transformer and Trading Decoder for Encoder Layers](https://aclanthology.org/2021.naacl-main.7/) (Xu et al., NAACL 2021)
ACL