@inproceedings{vazquez-etal-2022-closer,
    title = "A Closer Look at Parameter Contributions When Training Neural Language and Translation Models",
    author = {V{\'a}zquez, Ra{\'u}l and
      Celikkanat, Hande and
      Ravishankar, Vinit and
      Creutz, Mathias and
      Tiedemann, J{\"o}rg},
    booktitle = "Proceedings of the 29th International Conference on Computational Linguistics",
    month = oct,
    year = "2022",
    address = "Gyeongju, Republic of Korea",
    publisher = "International Committee on Computational Linguistics",
    url = "https://aclanthology.org/2022.coling-1.424",
    pages = "4788--4800",
    abstract = "We analyze the learning dynamics of neural language and translation models using Loss Change Allocation (LCA), an indicator that enables a fine-grained analysis of parameter updates when optimizing for the loss function. In other words, we can observe the contributions of different network components at training time. In this article, we systematically study masked language modeling, causal language modeling, and machine translation. We show that the choice of training objective leads to distinctive optimization procedures, even when performed on comparable Transformer architectures. We demonstrate how the various Transformer parameters are used during training, supporting that the feed-forward components of each layer are the main contributors to the optimization procedure. Finally, we find that the learning dynamics are not affected by data size and distribution but rather determined by the learning objective.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="vazquez-etal-2022-closer">
    <titleInfo>
      <title>A Closer Look at Parameter Contributions When Training Neural Language and Translation Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Raúl</namePart>
      <namePart type="family">Vázquez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hande</namePart>
      <namePart type="family">Celikkanat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vinit</namePart>
      <namePart type="family">Ravishankar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mathias</namePart>
      <namePart type="family">Creutz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jörg</namePart>
      <namePart type="family">Tiedemann</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-10</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 29th International Conference on Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <publisher>International Committee on Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We analyze the learning dynamics of neural language and translation models using Loss Change Allocation (LCA), an indicator that enables a fine-grained analysis of parameter updates when optimizing for the loss function. In other words, we can observe the contributions of different network components at training time. In this article, we systematically study masked language modeling, causal language modeling, and machine translation. We show that the choice of training objective leads to distinctive optimization procedures, even when performed on comparable Transformer architectures. We demonstrate how the various Transformer parameters are used during training, supporting that the feed-forward components of each layer are the main contributors to the optimization procedure. Finally, we find that the learning dynamics are not affected by data size and distribution but rather determined by the learning objective.</abstract>
    <identifier type="citekey">vazquez-etal-2022-closer</identifier>
    <location>
      <url>https://aclanthology.org/2022.coling-1.424</url>
    </location>
    <part>
      <date>2022-10</date>
      <extent unit="page">
        <start>4788</start>
        <end>4800</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A Closer Look at Parameter Contributions When Training Neural Language and Translation Models
%A Vázquez, Raúl
%A Celikkanat, Hande
%A Ravishankar, Vinit
%A Creutz, Mathias
%A Tiedemann, Jörg
%S Proceedings of the 29th International Conference on Computational Linguistics
%D 2022
%8 October
%I International Committee on Computational Linguistics
%C Gyeongju, Republic of Korea
%F vazquez-etal-2022-closer
%X We analyze the learning dynamics of neural language and translation models using Loss Change Allocation (LCA), an indicator that enables a fine-grained analysis of parameter updates when optimizing for the loss function. In other words, we can observe the contributions of different network components at training time. In this article, we systematically study masked language modeling, causal language modeling, and machine translation. We show that the choice of training objective leads to distinctive optimization procedures, even when performed on comparable Transformer architectures. We demonstrate how the various Transformer parameters are used during training, supporting that the feed-forward components of each layer are the main contributors to the optimization procedure. Finally, we find that the learning dynamics are not affected by data size and distribution but rather determined by the learning objective.
%U https://aclanthology.org/2022.coling-1.424
%P 4788-4800
Markdown (Informal)
[A Closer Look at Parameter Contributions When Training Neural Language and Translation Models](https://aclanthology.org/2022.coling-1.424) (Vázquez et al., COLING 2022)
ACL
Raúl Vázquez, Hande Celikkanat, Vinit Ravishankar, Mathias Creutz, and Jörg Tiedemann. 2022. A Closer Look at Parameter Contributions When Training Neural Language and Translation Models. In Proceedings of the 29th International Conference on Computational Linguistics, pages 4788–4800, Gyeongju, Republic of Korea. International Committee on Computational Linguistics.
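
The Loss Change Allocation (LCA) indicator mentioned in the abstract decomposes the change in training loss into per-parameter contributions, which is what lets the paper compare how much each Transformer component drives optimization. The sketch below is not the authors' code: it is a minimal first-order illustration in PyTorch (the original LCA formulation integrates along the update path more carefully), and `model`, `loss_fn`, `batch`, and `optimizer` are hypothetical placeholders.

```python
import torch


def lca_step(model, loss_fn, batch, optimizer):
    """Run one optimization step and return per-tensor LCA contributions."""
    # Gradients of the loss at the current parameters.
    optimizer.zero_grad()
    loss = loss_fn(model, batch)  # hypothetical: any scalar training loss
    loss.backward()

    # Snapshot gradients and parameter values before the update.
    before = {
        name: (p.grad.detach().clone(), p.detach().clone())
        for name, p in model.named_parameters()
        if p.grad is not None
    }

    optimizer.step()

    # First-order allocation: grad * (theta_after - theta_before), summed per tensor.
    # Negative values mean the tensor helped lower the loss at this step
    # (an assumption of this sketch: higher-order path terms are ignored).
    after = dict(model.named_parameters())
    return {
        name: torch.sum(grad * (after[name].detach() - theta0)).item()
        for name, (grad, theta0) in before.items()
    }
```

Summing the returned values over training steps and grouping tensors by component (e.g. feed-forward vs. attention blocks) gives the kind of per-component view of optimization that the abstract describes.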