@inproceedings{simoulin-crabbe-2021-many,
title = "How Many Layers and Why? {A}n Analysis of the Model Depth in Transformers",
author = "Simoulin, Antoine and
Crabb{\'e}, Benoit",
editor = "Kabbara, Jad and
Lin, Haitao and
Paullada, Amandalynne and
Vamvas, Jannis",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-srw.23",
doi = "10.18653/v1/2021.acl-srw.23",
pages = "221--228",
abstract = "In this study, we investigate the role of the multiple layers in deep transformer models. We design a variant of Albert that dynamically adapts the number of layers for each token of the input. The key specificity of Albert is that weights are tied across layers. Therefore, the stack of encoder layers iteratively repeats the application of the same transformation function on the input. We interpret the repetition of this application as an iterative process where the token contextualized representations are progressively refined. We analyze this process at the token level during pre-training, fine-tuning, and inference. We show that tokens do not require the same amount of iterations and that difficult or crucial tokens for the task are subject to more iterations.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="simoulin-crabbe-2021-many">
<titleInfo>
<title>How Many Layers and Why? An Analysis of the Model Depth in Transformers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Antoine</namePart>
<namePart type="family">Simoulin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benoit</namePart>
<namePart type="family">Crabbé</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jad</namePart>
<namePart type="family">Kabbara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haitao</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amandalynne</namePart>
<namePart type="family">Paullada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jannis</namePart>
<namePart type="family">Vamvas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this study, we investigate the role of the multiple layers in deep transformer models. We design a variant of Albert that dynamically adapts the number of layers for each token of the input. The key specificity of Albert is that weights are tied across layers. Therefore, the stack of encoder layers iteratively repeats the application of the same transformation function on the input. We interpret the repetition of this application as an iterative process where the token contextualized representations are progressively refined. We analyze this process at the token level during pre-training, fine-tuning, and inference. We show that tokens do not require the same amount of iterations and that difficult or crucial tokens for the task are subject to more iterations.</abstract>
<identifier type="citekey">simoulin-crabbe-2021-many</identifier>
<identifier type="doi">10.18653/v1/2021.acl-srw.23</identifier>
<location>
<url>https://aclanthology.org/2021.acl-srw.23</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>221</start>
<end>228</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How Many Layers and Why? An Analysis of the Model Depth in Transformers
%A Simoulin, Antoine
%A Crabbé, Benoit
%Y Kabbara, Jad
%Y Lin, Haitao
%Y Paullada, Amandalynne
%Y Vamvas, Jannis
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F simoulin-crabbe-2021-many
%X In this study, we investigate the role of the multiple layers in deep transformer models. We design a variant of Albert that dynamically adapts the number of layers for each token of the input. The key specificity of Albert is that weights are tied across layers. Therefore, the stack of encoder layers iteratively repeats the application of the same transformation function on the input. We interpret the repetition of this application as an iterative process where the token contextualized representations are progressively refined. We analyze this process at the token level during pre-training, fine-tuning, and inference. We show that tokens do not require the same amount of iterations and that difficult or crucial tokens for the task are subject to more iterations.
%R 10.18653/v1/2021.acl-srw.23
%U https://aclanthology.org/2021.acl-srw.23
%U https://doi.org/10.18653/v1/2021.acl-srw.23
%P 221-228
Markdown (Informal)
[How Many Layers and Why? An Analysis of the Model Depth in Transformers](https://aclanthology.org/2021.acl-srw.23) (Simoulin & Crabbé, ACL-IJCNLP 2021)
ACL
- Antoine Simoulin and Benoit Crabbé. 2021. How Many Layers and Why? An Analysis of the Model Depth in Transformers. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop, pages 221–228, Online. Association for Computational Linguistics.