@article{mickus-etal-2022-dissect,
title = "How to Dissect a {M}uppet: The Structure of Transformer Embedding Spaces",
author = "Mickus, Timothee and
Paperno, Denis and
Constant, Mathieu",
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "10",
year = "2022",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2022.tacl-1.57",
doi = "10.1162/tacl_a_00501",
pages = "981--996",
abstract = "Pretrained embeddings based on the Transformer architecture have taken the NLP community by storm. We show that they can mathematically be reframed as a sum of vector factors and showcase how to use this reframing to study the impact of each component. We provide evidence that multi-head attentions and feed-forwards are not equally useful in all downstream applications, as well as a quantitative overview of the effects of finetuning on the overall embedding space. This approach allows us to draw connections to a wide range of previous studies, from vector space anisotropy to attention weights.",
}
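Note on the abstract's central claim: the "sum of vector factors" reframing can be made concrete as an additive decomposition of each output embedding. The equation below is a minimal LaTeX sketch of one reading of that claim; the grouping into input, multi-head attention, feed-forward, and correction factors follows the components the abstract names, but the notation (e, i, h, f, c and the layer/head indices) is assumed here, not quoted from the paper.

% Assumed notation: e_t = final embedding of token t after an L-layer,
% H-head Transformer; i_t = input (token + positional) factor;
% h_{t,l,h} = contribution of attention head h in layer l;
% f_{t,l} = feed-forward contribution of layer l;
% c_t = residual correction factor (biases, layer normalization).
\mathbf{e}_t \;=\; \mathbf{i}_t
    \;+\; \sum_{l=1}^{L} \sum_{h=1}^{H} \mathbf{h}_{t,l,h}
    \;+\; \sum_{l=1}^{L} \mathbf{f}_{t,l}
    \;+\; \mathbf{c}_t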
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mickus-etal-2022-dissect">
<titleInfo>
<title>How to Dissect a Muppet: The Structure of Transformer Embedding Spaces</title>
</titleInfo>
<name type="personal">
<namePart type="given">Timothee</namePart>
<namePart type="family">Mickus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denis</namePart>
<namePart type="family">Paperno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathieu</namePart>
<namePart type="family">Constant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Pretrained embeddings based on the Transformer architecture have taken the NLP community by storm. We show that they can mathematically be reframed as a sum of vector factors and showcase how to use this reframing to study the impact of each component. We provide evidence that multi-head attentions and feed-forwards are not equally useful in all downstream applications, as well as a quantitative overview of the effects of finetuning on the overall embedding space. This approach allows us to draw connections to a wide range of previous studies, from vector space anisotropy to attention weights.</abstract>
<identifier type="citekey">mickus-etal-2022-dissect</identifier>
<identifier type="doi">10.1162/tacl_a_00501</identifier>
<location>
<url>https://aclanthology.org/2022.tacl-1.57</url>
</location>
<part>
<date>2022</date>
<detail type="volume"><number>10</number></detail>
<extent unit="page">
<start>981</start>
<end>996</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T How to Dissect a Muppet: The Structure of Transformer Embedding Spaces
%A Mickus, Timothee
%A Paperno, Denis
%A Constant, Mathieu
%J Transactions of the Association for Computational Linguistics
%D 2022
%V 10
%I MIT Press
%C Cambridge, MA
%F mickus-etal-2022-dissect
%X Pretrained embeddings based on the Transformer architecture have taken the NLP community by storm. We show that they can mathematically be reframed as a sum of vector factors and showcase how to use this reframing to study the impact of each component. We provide evidence that multi-head attentions and feed-forwards are not equally useful in all downstream applications, as well as a quantitative overview of the effects of finetuning on the overall embedding space. This approach allows us to draw connections to a wide range of previous studies, from vector space anisotropy to attention weights.
%R 10.1162/tacl_a_00501
%U https://aclanthology.org/2022.tacl-1.57
%U https://doi.org/10.1162/tacl_a_00501
%P 981-996
Markdown (Informal)
[How to Dissect a Muppet: The Structure of Transformer Embedding Spaces](https://aclanthology.org/2022.tacl-1.57) (Mickus et al., TACL 2022)
ACL
Timothee Mickus, Denis Paperno, and Mathieu Constant. 2022. How to Dissect a Muppet: The Structure of Transformer Embedding Spaces. Transactions of the Association for Computational Linguistics, 10:981–996.