@inproceedings{yang-etal-2020-sub,
title = "On the Sub-layer Functionalities of Transformer Decoder",
author = "Yang, Yilin and
Wang, Longyue and
Shi, Shuming and
Tadepalli, Prasad and
Lee, Stefan and
Tu, Zhaopeng",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.432",
doi = "10.18653/v1/2020.findings-emnlp.432",
pages = "4799--4811",
abstract = "There have been significant efforts to interpret the encoder of Transformer-based encoder-decoder architectures for neural machine translation (NMT); meanwhile, the decoder remains largely unexamined despite its critical role. During translation, the decoder must predict output tokens by considering both the source-language text from the encoder and the target-language prefix produced in previous steps. In this work, we study how Transformer-based decoders leverage information from the source and target languages {--} developing a universal probe task to assess how information is propagated through each module of each decoder layer. We perform extensive experiments on three major translation datasets (WMT En-De, En-Fr, and En-Zh). Our analysis provides insight on when and where decoders leverage different sources. Based on these insights, we demonstrate that the residual feed-forward module in each Transformer decoder layer can be dropped with minimal loss of performance {--} a significant reduction in computation and number of parameters, and consequently a significant boost to both training and inference speed.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-etal-2020-sub">
<titleInfo>
<title>On the Sub-layer Functionalities of Transformer Decoder</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yilin</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Longyue</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuming</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prasad</namePart>
<namePart type="family">Tadepalli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhaopeng</namePart>
<namePart type="family">Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>There have been significant efforts to interpret the encoder of Transformer-based encoder-decoder architectures for neural machine translation (NMT); meanwhile, the decoder remains largely unexamined despite its critical role. During translation, the decoder must predict output tokens by considering both the source-language text from the encoder and the target-language prefix produced in previous steps. In this work, we study how Transformer-based decoders leverage information from the source and target languages – developing a universal probe task to assess how information is propagated through each module of each decoder layer. We perform extensive experiments on three major translation datasets (WMT En-De, En-Fr, and En-Zh). Our analysis provides insight on when and where decoders leverage different sources. Based on these insights, we demonstrate that the residual feed-forward module in each Transformer decoder layer can be dropped with minimal loss of performance – a significant reduction in computation and number of parameters, and consequently a significant boost to both training and inference speed.</abstract>
<identifier type="citekey">yang-etal-2020-sub</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.432</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.432</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>4799</start>
<end>4811</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Sub-layer Functionalities of Transformer Decoder
%A Yang, Yilin
%A Wang, Longyue
%A Shi, Shuming
%A Tadepalli, Prasad
%A Lee, Stefan
%A Tu, Zhaopeng
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F yang-etal-2020-sub
%X There have been significant efforts to interpret the encoder of Transformer-based encoder-decoder architectures for neural machine translation (NMT); meanwhile, the decoder remains largely unexamined despite its critical role. During translation, the decoder must predict output tokens by considering both the source-language text from the encoder and the target-language prefix produced in previous steps. In this work, we study how Transformer-based decoders leverage information from the source and target languages – developing a universal probe task to assess how information is propagated through each module of each decoder layer. We perform extensive experiments on three major translation datasets (WMT En-De, En-Fr, and En-Zh). Our analysis provides insight on when and where decoders leverage different sources. Based on these insights, we demonstrate that the residual feed-forward module in each Transformer decoder layer can be dropped with minimal loss of performance – a significant reduction in computation and number of parameters, and consequently a significant boost to both training and inference speed.
%R 10.18653/v1/2020.findings-emnlp.432
%U https://aclanthology.org/2020.findings-emnlp.432
%U https://doi.org/10.18653/v1/2020.findings-emnlp.432
%P 4799-4811
Markdown (Informal)
[On the Sub-layer Functionalities of Transformer Decoder](https://aclanthology.org/2020.findings-emnlp.432) (Yang et al., Findings 2020)
ACL
- Yilin Yang, Longyue Wang, Shuming Shi, Prasad Tadepalli, Stefan Lee, and Zhaopeng Tu. 2020. On the Sub-layer Functionalities of Transformer Decoder. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 4799–4811, Online. Association for Computational Linguistics.