@inproceedings{wang-tu-2020-rethinking,
    title = "Rethinking the Value of Transformer Components",
    author = "Wang, Wenxuan and
      Tu, Zhaopeng",
    editor = "Scott, Donia and
      Bel, Nuria and
      Zong, Chengqing",
    booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
    month = dec,
    year = "2020",
    address = "Barcelona, Spain (Online)",
    publisher = "International Committee on Computational Linguistics",
    url = "https://aclanthology.org/2020.coling-main.529/",
    doi = "10.18653/v1/2020.coling-main.529",
    pages = "6019--6029",
    abstract = "Transformer has become the state-of-the-art translation model, yet it is not well studied how each intermediate component contributes to the model performance, which poses significant challenges for designing optimal architectures. In this work, we bridge this gap by evaluating the impact of individual components (sub-layers) in trained Transformer models from different perspectives. Experimental results across language pairs, training strategies, and model capacities show that certain components are consistently more important than others. We also report a number of interesting findings that might help humans better analyze, understand, and improve Transformer models. Based on these observations, we further propose a new training strategy that can improve translation performance by distinguishing the unimportant components during training."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wang-tu-2020-rethinking">
    <titleInfo>
      <title>Rethinking the Value of Transformer Components</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Wenxuan</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhaopeng</namePart>
      <namePart type="family">Tu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 28th International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Donia</namePart>
        <namePart type="family">Scott</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nuria</namePart>
        <namePart type="family">Bel</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Chengqing</namePart>
        <namePart type="family">Zong</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>International Committee on Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Transformer has become the state-of-the-art translation model, yet it is not well studied how each intermediate component contributes to the model performance, which poses significant challenges for designing optimal architectures. In this work, we bridge this gap by evaluating the impact of individual components (sub-layers) in trained Transformer models from different perspectives. Experimental results across language pairs, training strategies, and model capacities show that certain components are consistently more important than others. We also report a number of interesting findings that might help humans better analyze, understand, and improve Transformer models. Based on these observations, we further propose a new training strategy that can improve translation performance by distinguishing the unimportant components during training.</abstract>
<identifier type="citekey">wang-tu-2020-rethinking</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.529</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.529/</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>6019</start>
<end>6029</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Rethinking the Value of Transformer Components
%A Wang, Wenxuan
%A Tu, Zhaopeng
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F wang-tu-2020-rethinking
%X Transformer has become the state-of-the-art translation model, yet it is not well studied how each intermediate component contributes to the model performance, which poses significant challenges for designing optimal architectures. In this work, we bridge this gap by evaluating the impact of individual components (sub-layers) in trained Transformer models from different perspectives. Experimental results across language pairs, training strategies, and model capacities show that certain components are consistently more important than others. We also report a number of interesting findings that might help humans better analyze, understand, and improve Transformer models. Based on these observations, we further propose a new training strategy that can improve translation performance by distinguishing the unimportant components during training.
%R 10.18653/v1/2020.coling-main.529
%U https://aclanthology.org/2020.coling-main.529/
%U https://doi.org/10.18653/v1/2020.coling-main.529
%P 6019-6029
Markdown (Informal)
[Rethinking the Value of Transformer Components](https://aclanthology.org/2020.coling-main.529/) (Wang & Tu, COLING 2020)
ACL
Wenxuan Wang and Zhaopeng Tu. 2020. Rethinking the Value of Transformer Components. In Proceedings of the 28th International Conference on Computational Linguistics, pages 6019–6029, Barcelona, Spain (Online). International Committee on Computational Linguistics.